Commit 3e76c60a authored by Swastik Mishra

Initial commit

arc/
scrap/
tmp*
nohup*
.txt
.vscode
pyrightconfig.json
__pycache__
# Ignore everything in this directory
data/*
plots/*
# But not the directory itself
!data/.gitkeep
!plots/.gitkeep
%% Cell type:markdown id: tags:
Run `download_ncbi_genome_sequences.py` like so:
```bash
nohup python src/download_ncbi_genome_sequences.py -i 1236_subset.taxa > nohup_download_genome_seqs.out & disown
```
By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list.
Each of these files is of the form `{NCBI_taxon_ID}_{NCBI_genome_accession_ID}.{extension}`
The TSV file `genome_sequences/1236_subset_accession_ids.tsv` lists the mapping between taxon_ID and accession_ID, along with other information such as the assembly name and the source DB (which is not always REFSEQ).
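A quick way to sanity-check this mapping is to load the TSV with pandas (a minimal sketch; the path assumes the default output directory mentioned above, and the column names are those written by the download script):
%% Cell type:code id: tags:
``` python
import pandas as pd

# path assumes the default output directory of the download script
acc_df = pd.read_csv('../data/genome_sequences/1236_subset_accession_ids.tsv', sep='\t')
print(acc_df.columns.tolist())  # expected: taxon_id, acc_id, assembly_name, source_db
print(acc_df.head())
```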
%% Cell type:code id: tags:
``` python
# import libraries
import itertools
```
%% Cell type:code id: tags:
``` python
# Then, prepare a multiple sequence fasta file for all of these genomes, and similarly such a file for all of the genes of interest.
# for the genomes, we just need to concatenate all the fasta files
# in the genome_sequences directory, i.e. the output dir of the previous script
# for the genes of interest, we need to first prepare a list of all the gene names.
# for this, first read in the members.tsv file
members_tsv_filepath = '1236_subset_NOG_members.tsv'
# read only the 6th column (CSV lists of the gene names)
with open(members_tsv_filepath) as fo:
flines = fo.readlines()
gene_names = [line.split('\t')[5] for line in flines]
# now, split the gene names into a list of lists
gene_names = [gn.split(',') for gn in gene_names]
# flatten this huge list of lists efficiently
gene_names = list(itertools.chain.from_iterable(gene_names))
# remove duplicates
gene_names = list(set(gene_names))
print(f'Found {len(gene_names)} unique gene names. List looks like this:', gene_names[:10])
# write the gene names to a file, replacing in the members.tsv filepath, 'members' with 'genes' and 'tsv' with 'list'
gene_names_filepath = members_tsv_filepath.replace(
'members', 'genes').replace('tsv', 'list')
print(f'Writing gene names to {gene_names_filepath}')
with open(gene_names_filepath, 'w') as fo:
fo.write('\n'.join(gene_names) + '\n')
```
%% Output
Found 100692 unique gene names. List looks like this: ['1123053_GCA_000425345_02619', '128785_GCA_001556105_01030', '1123514_GCA_000381085_02040', '1441930_Z042_05715', '1449087_GCA_000711305_00098', '767434_Fraau_2269', '377629_TERTU_3187', '437022_CC99x_00673', '658445_H744_2c0221', '1265503_GCA_000378625_02015']
Writing gene names to 1236_subset_NOG_genes.list
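%% Cell type:markdown id: tags:
A minimal sketch of the genome-concatenation step mentioned in the comments above: concatenate every downloaded genome FASTA into one multi-sequence file. The directory path and output filename are assumptions based on the defaults of the download script.
%% Cell type:code id: tags:
``` python
import glob

genome_dir = '../data/genome_sequences'  # assumed: output dir of the download script
combined_fasta_path = 'all_genomes.fna'  # hypothetical output filename
with open(combined_fasta_path, 'w') as out_fo:
    # each downloaded genome FASTA is named {taxon_ID}_{accession_ID}.fna
    for fasta_path in sorted(glob.glob(f'{genome_dir}/*.fna')):
        with open(fasta_path) as in_fo:
            out_fo.write(in_fo.read())
```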
import time
import os
import subprocess
from loguru import logger
import pandas as pd
from multiprocessing import Pool
import shutil
import traceback
import io
import zipfile
# Uses ncbi `datasets` to download each genome assembly sequence
def download_genome_sequences(args):
"""
Download genome sequences from NCBI using ncbi `datasets` CLI program
Args:
acc_id (str): assembly accession ID
output_dir (str): path to the output directory
n_retries (int): number of retries to perform
Returns:
None
"""
# unpack the arguments
taxon_id, acc_id, output_dir, n_retries, files_to_download, bin_dir = args
# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)
# run ncbi `datasets`
try:
ncbi_retries = n_retries
while ncbi_retries > 0:
try:
logger.info(f"Running datasets for assembly accession ID: {
acc_id}, taxon ID: {taxon_id}")
download_process = subprocess.run(
f"{bin_dir}/datasets download genome accession {acc_id} --include {files_to_download} --filename {output_dir}/{acc_id}.zip", shell=True)
logger.info(
f"now extracting files for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
# if download_process didn't have success exit code, log error and try again
if download_process.returncode != 0:
logger.error(f"datasets failed for assembly accession ID: {acc_id}, taxon ID: {
taxon_id} with error code {download_process.returncode}")
if ncbi_retries == 0:
logger.error(f"no more retries for assembly accession ID: {acc_id}, taxon ID: {
taxon_id} after {n_retries} retries, skipping to the next assembly accession ID")
break
else:
logger.info(f"Retrying datasets for assembly accession ID: {
acc_id}, taxon ID: {taxon_id}, retries left: {ncbi_retries}")
ncbi_retries -= 1
continue
# extract the files from the zip archive, except jsonl files
# basically rename each of the files to {taxon_id}_{acc_id}_{original_file_name}
with zipfile.ZipFile(f"{output_dir}/{acc_id}.zip", 'r') as zip_ref:
if len(zip_ref.namelist()) == 0:
logger.error(f"no files found in zip archive for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
break
for file_i in zip_ref.namelist():
if file_i.endswith('.jsonl') or file_i.endswith('.json') or file_i.endswith('README.md'):
continue
logger.info(f"extracting file: {file_i} for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
with zip_ref.open(file_i) as f:
file_i_basename = os.path.basename(file_i)
# check if file_i_basename starts with acc_id
if file_i_basename.startswith(acc_id):
with open(f"{output_dir}/{taxon_id}_{file_i_basename}", 'wb') as f_out:
f_out.write(f.read())
else:
with open(f"{output_dir}/{taxon_id}_{acc_id}_{file_i_basename}", 'wb') as f_out:
f_out.write(f.read())
# if the command runs successfully, break out of the while loop
break
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed for assembly accession ID: {
acc_id}, taxon ID: {taxon_id} with error\n {e}")
if ncbi_retries == 0:
logger.error(
f"No more retries left for assembly accession ID: {acc_id}, taxon ID: {taxon_id} after {n_retries} retries, skipping to the next assembly accession ID")
break
else:
logger.info(
f"Retrying datasets for assembly accession ID: {acc_id}, taxon ID: {taxon_id}, retries left: {ncbi_retries}")
ncbi_retries -= 1
continue
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed with error\n {e}")
return None
#
return taxon_id
def retrieve_accession_info(args):
"""
Fn for `multiprocessing.Pool` to retrieve assembly accession ID for a given taxon ID.
"""
taxon_id, n_retries, bin_dir = args
try:
datasets_retries = n_retries
while datasets_retries > 0:
try:
logger.info(f"Running datasets for taxon ID: {taxon_id}")
#
p1 = subprocess.Popen(
f"{bin_dir}/datasets summary genome taxon {taxon_id} --as-json-lines", shell=True, stdout=subprocess.PIPE)
logger.info(
f"now running dataformat for taxon ID: {taxon_id}")
tsv_records_process = subprocess.run(
f"{bin_dir}/dataformat tsv genome --fields accession,assminfo-name,source_database,assminfo-refseq-category", shell=True, stdin=p1.stdout, stdout=subprocess.PIPE)
# if the command runs successfully, break out of the while loop
if tsv_records_process.returncode == 0:
if len(tsv_records_process.stdout) == 0:
logger.info(
f"No records found for taxon ID: {taxon_id}")
return None
else:
tsv_records = tsv_records_process.stdout.decode(
'utf-8').split('\n')
break
else:
logger.error(
f"dataformat failed for taxon ID: {taxon_id} with returncode {tsv_records_process.returncode} and error:\n {tsv_records_process.stderr}")
datasets_retries -= 1
if datasets_retries == 0:
logger.error(
f"datasets failed for taxon ID: {taxon_id} after {n_retries} retries for taxon ID: {taxon_id}.")
return None
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed for taxon ID: {
taxon_id} with error\n {get_exception_traceback_str(e)}")
datasets_retries -= 1
if datasets_retries == 0:
logger.error(
f"datasets failed for taxon ID: {taxon_id} after {n_retries} retries for taxon ID: {taxon_id}.")
return None
else:
logger.info(
f"Retrying datasets for taxon ID: {taxon_id}, retries left: {datasets_retries}")
continue
# remove header
if tsv_records[0].startswith('Assembly Accession'):
tsv_records = tsv_records[1:]
# if there are no records, log an error and continue to the next taxon ID
if len(tsv_records) == 0:
logger.info(f"No records found for taxon ID: {taxon_id}")
# this typically means that the ID itself doesn't have a genome associated with it,
# but the ID is valid and could be a strain ID; skip to the next taxon ID
return None
# if there are records, log the number of records found
logger.info(f"Found {len(tsv_records)
} records for taxon ID: {taxon_id}")
try:
if tsv_records[-1] == '':
tsv_records = tsv_records[:-1]
# for each record, extract the assembly accession ID and source database
# for record in tsv_records:
# print('\n'.join(tsv_records))
# if there's only one record, that is the acc_id
if len(tsv_records) == 1:
assembly_name = tsv_records[0].split('\t')[1]
# elif there are 2 or more records,
# check if one of them is RefSeq or if one of the RefSeq records has category of 'representative genome' or 'reference genome'
elif len(tsv_records) >= 2:
refseq_record = None
for record in tsv_records:
if record.split('\t')[2].endswith('REFSEQ'):
refseq_record = record
if record.split('\t')[3] in ['representative genome', 'reference genome']:
assembly_name = record.split('\t')[1]  # use this representative/reference record's assembly name
break
refseq_record = None
if refseq_record is None:
assembly_name = tsv_records[0].split('\t')[1]
# now use the assembly_name to get the acc_id of the record
# i.e. record in tsv_records with the same assembly_name but from GENBANK
# if there's no record from GENBANK, use the first record
taxon_record = [record for record in tsv_records if record.split(
'\t')[1] == assembly_name and record.split('\t')[2] == 'GENBANK']
if len(taxon_record) == 0:
taxon_record = tsv_records[0]
else:
taxon_record = taxon_record[0]
return {'taxon_id': taxon_id, 'acc_id': taxon_record.split('\t')[0],
'assembly_name': taxon_record.split('\t')[1], 'source_db': taxon_record.split('\t')[2]}
except Exception as e:
logger.info(
f"TSV records list that was not processed: {tsv_records}")
logger.error(f"Failed to extract acc_id for taxon ID: {
taxon_id}. Traceback\n {get_exception_traceback_str(e)}")
return None
except Exception as e:
logger.error(f"datasets failed for taxon ID: {
taxon_id}. Traceback\n {get_exception_traceback_str(e)}")
return None
def get_exception_traceback_str(exc: Exception) -> str:
# Ref: https://stackoverflow.com/a/76584117/
iofile = io.StringIO()
traceback.print_exception(exc, file=iofile)
return iofile.getvalue().rstrip()
def process_taxa_ids(input_file, taxon_ids_list, output_dir, threads, n_retries, files_to_download, bin_dir, acc_id_tsv_filepath):
"""
This function is used to download genome sequences from NCBI using the NCBI `datasets` CLI.
Args:
input_file (str): path to the input file containing taxon IDs
taxon_ids_list (list): list of taxon IDs
output_dir (str): path to the output directory
threads (int): number of threads to use
n_retries (int): number of retries to perform if failures are encountered
files_to_download (str): files to download from NCBI
bin_dir (str): path to the directory where the `datasets` and `dataformat` CLI programs are located
acc_id_tsv_filepath (str): path to the file containing assembly accession IDs
Returns:
None: if the function runs successfully
"""
to_download_info = False
# if the acc_id_tsv_filepath is provided, read in the assembly accession IDs from the file and use this df to download genome/gff/etc.
if acc_id_tsv_filepath is not None:
logger.info(f"Assembly accession IDs file provided, reading in assembly accession IDs from file: {acc_id_tsv_filepath}")
taxon_id_acc_id_df = pd.read_csv(acc_id_tsv_filepath, sep='\t')
taxon_id_acc_id_df['taxon_id'] = taxon_id_acc_id_df['taxon_id'].astype(str)
logger.info(f"Read in assembly accession IDs. Number of records: {len(taxon_id_acc_id_df)}")
acc_id_tsv_taxon_ids_list = taxon_id_acc_id_df['taxon_id'].tolist()
taxa_without_acc_list = set(taxon_ids_list) - set(acc_id_tsv_taxon_ids_list)
logger.info(f"taxon IDs list looks like this: {taxon_ids_list[:5]}, and in acc_id_tsv file: {acc_id_tsv_taxon_ids_list[:5]}. Difference: {len(taxa_without_acc_list)}")
# check if the taxon IDs in the file are the same as the taxon IDs in the input file. If not, then to_download_info = True
if len(taxa_without_acc_list) > 0:
to_download_info = True
logger.info(f"{acc_id_tsv_filepath} doesn't have Acc IDs for {len(taxa_without_acc_list)} taxon IDs that are in the input file")
else:
to_download_info = False
else:
taxon_id_acc_id_df = pd.DataFrame(columns=['taxon_id', 'acc_id', 'assembly_name', 'source_db'])
to_download_info = True
taxa_without_acc_list = taxon_ids_list
if to_download_info: # if the assembly accession IDs file contains taxon IDs that are not in the input file, run datasets to get assembly accession IDs for these taxon IDs
logger.info(f"Running datasets to get assembly accession IDs for each taxon ID")
# run ncbi 'datasets' to query for assembly accession IDs for each taxon ID, and write it into a pandas dataframe
taxon_id_acc_id_df2 = pd.DataFrame(columns=['taxon_id', 'acc_id', 'assembly_name', 'source_db'])
taxon_id_acc_id_list = []
# use multiprocessing to run the datasets function for each taxon ID, and write the results to a dataframe
with Pool(processes=threads) as pool:
for pool_out in pool.imap(retrieve_accession_info, [(taxon_id, n_retries, bin_dir) for taxon_id in taxa_without_acc_list]):
if pool_out is not None:
taxon_id_acc_id_list.append(pool_out)
taxon_id_acc_id_df2 = pd.DataFrame(taxon_id_acc_id_list)
# remove rows where duplicates of the acc_id column exist
taxon_id_acc_id_df2.drop_duplicates(subset='acc_id', inplace=True)
# add these new records to the existing taxon_id_acc_id_df
taxon_id_acc_id_df = pd.concat([taxon_id_acc_id_df, taxon_id_acc_id_df2], ignore_index=True)
# write the new taxon_id_acc_id_df to a file
taxon_id_acc_id_df.to_csv(
f"{output_dir}/{input_file.split('/')[-1].split('.')[0]}_accession_ids.tsv", sep='\t', index=False, quoting=1)
logger.info(f"Assembly accession IDs written to file: {output_dir}/{input_file.split('/')[-1].split('.')[0]}_accession_ids.tsv")
# if the taxon_id_acc_id_df is empty and there was no original acc_id_tsv_filepath provided, log an error and exit
if taxon_id_acc_id_df.empty and acc_id_tsv_filepath is None:
logger.error(f"No assembly accession IDs found for any taxon ID, exiting")
return None
elif taxon_id_acc_id_df.empty and acc_id_tsv_filepath is not None:
logger.error(f"No assembly accession IDs found for any additional taxon ID. Continuing with the original assembly accession IDs file")
# now we have the assembly accession IDs, we can use ncbi-genome-download to download the genome sequences, in parallel
# create args list for parallel processing
tax_acc_tuples_list = list(
zip(taxon_id_acc_id_df['taxon_id'], taxon_id_acc_id_df['acc_id']))
logger.info(f"Number of assembly accession IDs in dataframe: {len(taxon_id_acc_id_df)}")
# which `files_to_download` files are already downloaded in output_dir?
files_to_download_list = files_to_download.split(',')
extensions_dict = {'genome': '.fna', 'gff3': '.gff',
'gbff': '.gbff', 'protein': '.faa',
'cds': '.ffn', 'rna': '.fna',
'gtf': '.gtf'}
for file_type in files_to_download_list:
if not all([os.path.exists(f"{output_dir}/{taxon_id}_{acc_id}{extensions_dict[file_type]}") for taxon_id, acc_id in tax_acc_tuples_list]):
logger.info(f"Downloading {file_type} files")
tax_acc_tuples_list = [(taxon_id, acc_id) for taxon_id, acc_id in tax_acc_tuples_list if not os.path.exists(f"{output_dir}/{taxon_id}_{acc_id}{extensions_dict[file_type]}")]
logger.info(f"Number of assembly accession IDs to download: {len(tax_acc_tuples_list)}")
if len(tax_acc_tuples_list) == 0:
logger.info(f"All {file_type} files already downloaded, skipping")
continue
args_list = [(taxon_id, acc_id, output_dir, n_retries, files_to_download, bin_dir)
for taxon_id, acc_id in tax_acc_tuples_list]
logger.info(f"Downloading {files_to_download} from NCBI using {
threads} threads, input file: {input_file}, output directory: {output_dir}")
with Pool(processes=threads) as pool:
# iterate through what has been created via pool.imap()
for pool_out in pool.imap(download_genome_sequences, args_list):
if pool_out is not None:
logger.info(
f"Downloading {files_to_download} for taxon ID: {pool_out} finished successfully")
logger.info(f"Downloading genome sequences finished successfully")
return None
if __name__ == '__main__':
# log time
logger.info(f"Running {__file__} at {time.asctime()}")
start_time = time.time()
# import argparse and read in the arguments
import argparse
parser = argparse.ArgumentParser(
description='Download genome sequences from NCBI')
parser.add_argument(
'--input_file', '-i', help='Path to the input file containing taxon IDs', type=str, required=True)
parser.add_argument('--output_dir', '-o', help='Path to the output directory',
type=str, default='../data/genome_data/')
parser.add_argument('--threads', '-t',
help='Number of threads to use', type=int, default=100)
parser.add_argument('--to-download', type=str, default='genome,gff3',
help='Files to download from NCBI. Default is genome,gff3. \
See NCBI datasets CLI documentation for more possibilities.')
parser.add_argument(
'--retries', '-r', help='Number of retries to perform if failures are encountered, for each taxon ID input.', type=int, default=5)
parser.add_argument(
'--debug', '-d', help='Run in debug mode (runs first <--threads> entries only)', action='store_true')
parser.add_argument(
'--bin-dir', '-b', type=str, default='/root/mambaforge/envs/hgt_analyses/bin/',
help='Path to dir where `datasets` and `dataformat` CLI programs are.')
parser.add_argument('--acc_ids_tsv', '-a',
type=str, default=None,
help='Path to the file containing assembly accession IDs')
args = parser.parse_args()
# Create the output directory if it does not exist
os.makedirs(args.output_dir, exist_ok=True)
# read in all the taxon IDs from the input file
with open(args.input_file, 'r') as f:
taxon_ids_list = [line.strip()
for line in f.readlines() if line.strip() != '']
# debugging: run only for first 'threads' lines using a tmp_ file
if args.debug:
logger.debug(f"Running in debug mode, using only first {
args.threads} taxon IDs")
taxon_ids_list = taxon_ids_list[:args.threads]
# run the main function
process_taxa_ids(args.input_file, taxon_ids_list, args.output_dir, args.threads, args.retries, args.to_download, args.bin_dir, args.acc_ids_tsv)
# log the time
end_time = time.time()
duration = end_time - start_time
human_readable_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
logger.info(f"Finished running {__file__} in {human_readable_duration}")
#!/usr/bin/env python3
from subprocess import CalledProcessError, Popen
import os
import time
from datetime import timedelta, datetime
import argparse
import glob
from multiprocessing import Pool
def run_command(params):
"""
Inputs:
params: tuple of (shell_command, logFileObject)
Function:
Runs the shell command, and writes the output to the log file object
"""
cmd, log_file_path = params
with open(log_file_path, 'w') as log_FO:
process = Popen(cmd, stdout=log_FO, universal_newlines=True)
if process.wait() != 0:
raise CalledProcessError(process.returncode, cmd)
def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_subprocesses):
"""
Inputs:
input_file_path: TSV format input file. Columns are: treeID(str) and tree(str)
mad_executable: path to MAD executable
n_lines: number of lines that the input file will be split into for each split file.
Outputs:
Writes out a file with the same name as the input file, but with a `.rooted` at the end of the filename.
Additional info:
This function splits the input file into multiple files, and runs MAD on each of these split files in parallel.
The number of processes that will be run in parallel is equal to the number of split files.
The number of lines in each split file is equal to the `n_lines` argument.
The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees.
"""
# Check if the input file exists
if not os.path.isfile(input_file_path):
raise FileNotFoundError(f"The input file {input_file_path} does not seem to exist. Check the filepath provided.")
# Find the full path of the input file and extract the dir name
input_file_path = os.path.abspath(input_file_path)
input_dir = os.path.dirname(input_file_path)
# Get the current date and time
now = datetime.now()
# Format the current date and time as a string
timestamp = now.strftime("%Y%m%d_%H%M%S")
# Create a temporary directory path with the timestamp
tmp_dir = f"{input_dir}/tmp_{timestamp}"
# Create the temporary directory
os.mkdir(tmp_dir)
# log this with time, along with location of the tmp folder
print(f'Created the tmp folder at: {datetime.now()} at location: {tmp_dir}')
# Open the input file and read it in chunks of `n_lines` lines
nwk_split_files = [] # track the filenames of split files containing the second column (tree)
info_split_files = [] # track the filenames of split files containing the first column (treeID)
with open(input_file_path, 'r') as treesFO:
chunk = []
for i, line in enumerate(treesFO):
split_line = line.strip().split()
chunk.append(split_line)
if (i + 1) % n_lines == 0:
# Write out the split trees file
nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 1)
info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 1)
with open(nwk_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n')
with open(info_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n')
# track the filenames
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
chunk = []
# Write the last chunk if it's not empty
if chunk:
# lines in `chunk` were already split into [treeID, tree] pairs above
nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 2)
info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 2)
with open(nwk_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n')
with open(info_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n')
# track the filenames
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
# using subprocess.Popen inside the run_command function,
# call MAD on all of these split newick files. Prepare the commands.
# `mad -n trees.nwk.1`
commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout') for split_index, split_file_path in enumerate(nwk_split_files)]
mad_start_time = time.time()
# Log timestamp for the start of the MAD rooting process
print(f'MAD rooting starting at: {datetime.now()}')
# Create a multiprocessing Pool with the defined maximum number of subprocesses
with Pool(num_subprocesses) as pool:
# Use pool.map to run the commands in parallel
pool.map(run_command, commands)
# Indicate the completion of the MAD rooting process
print(f'MAD rooting completed at: {datetime.now()}. \
Total time taken for MAD rooting: {timedelta(seconds=time.time() - mad_start_time)}')
# Log that the split files are being combined, along with the timestamp
print(f'{datetime.now()}: Combining the split files into one file...')
# Note: mad adds a `.rooted` at the end of the input filename, as the output filename
# combine the columns from the info files with the nwk.rooted files to prepare the final output file
with open(input_file_path + '.rooted', 'w') as rootedOutputFO:
for i, nwk_split_filepath in enumerate(nwk_split_files):
# read in the corresponding split nwk.rooted file
rooted_nwk_split_filepath = f'{nwk_split_filepath}.rooted'
info_split_filepath = info_split_files[i]
with open(rooted_nwk_split_filepath, 'r') as rooted_nwk_split_FO:
rooted_nwk_trees = [
i.strip() for i in rooted_nwk_split_FO.readlines()
] # remove trailing newlines. This is a list of rooted trees
# combine the info and rooted trees
with open(info_split_filepath, 'r') as info_file:
rooted_info_plus_tree_chunks = [
'\t'.join([info_line.strip(), rooted_nwk_trees[line_index]])
for line_index, info_line in enumerate(info_file)
]
# write out the new tree line chunks to the output file
rootedOutputFO.write('\n'.join(rooted_info_plus_tree_chunks) + '\n')
# Clear memory
del rooted_info_plus_tree_chunks
del rooted_nwk_trees
# delete the split nwk files in the tmp folder
for split_file_name in nwk_split_files:
os.remove(split_file_name)
# combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir+'/tmp_mad_rooting_splits_'+timestamp+'.log')
# delete the tmp folder
# os.system('rm -rf '+tmp_dir)
if __name__ == '__main__':
start_time = time.time()
# Catches and interprets the user input from the command line.
parser = argparse.ArgumentParser()
# General stuff
parser.add_argument('--Input', '-i', required=True, type=str,
help="path to input file which should be tsv format with two columns: first column is the tree ID and the second column is the tree in newick")
parser.add_argument('--mad', '-m', required=False, type=str, default='mad',
help="path to MAD executable. Assumes mad is within $PATH by default, if this argument is not provided.")
parser.add_argument('--nlines', '-n', required=False, type=int, default=50,
help="(Int) default=50. \
Number of lines that the input file will be split into for each split\
file. Smaller number implies less trees being processed by one MAD process.")
parser.add_argument('--num_subprocesses', '-p', required=False, type=int, default=50,
help="(Int) default=50. Number of processes to run in parallel. \
The larger this number, the more number of processes that will \
be run in parallel, since each split file chunk gets its own process")
# parser.add_argument('--Output', '-o', required=True, type=str,
# help="path to output files")
args = parser.parse_args()
MAD_roots_NOG_trees_parallel(args.Input, args.mad, args.nlines, args.num_subprocesses)
print('Process time taken from start to finish: ' +
str(timedelta(seconds=time.time() - start_time)) )
%% Cell type:markdown id: tags:
# Run the HGT-inference programs
It is advised to run each program's script separately, since these runs are very time-consuming. For some programs, multiprocessing lets us split the work across workers to reduce the running time. We are running these on an HPC with ~50 threads each.
%% Cell type:code id: tags:
``` python
taxonomicID = '1236'
data = '/root/work/projects/hgt_inference_comparative_study/data/'
```
%% Cell type:markdown id: tags:
**Note:**
For each of the scripts run below of the form `run_{inference_program}.py` (e.g. `run_ALE.py`),
you can use the `-h` flag to see the help message for the script.
For example, run `python run_ALE.py -h`.
These scripts are run with their default parameters. You can see exactly which parameters and files are being used by opening a script and looking at its `parser.add_argument` calls. If you want to use different files, prepare files with the same format as those used in the scripts and pass them as arguments.
%% Cell type:code id: tags:
``` python
%%bash
# make directories for programs
mkdir -p ALE AnGST RANGER RANGER-Fast Count GLOOME_with_tree GLOOME_without_tree
```
%% Cell type:markdown id: tags:
## Ranger
From the manual for RANGER-DTL 2.0: the program takes as input a single file (specified using the `-i` command line option) containing first the species tree, followed by a single gene tree. All input trees must be expressed in Newick format terminated by a semicolon, and they must be fully binary (fully resolved) and rooted. Species names in the species tree must be unique. E.g., ((speciesA, speciesB), speciesC);
Each leaf in the gene tree must be labeled with the name of the species from which that gene was sampled. If desired, the gene name can be appended to the species name, separated by an underscore (`_`). The gene tree may contain any number (zero, one, or more) of homologous genes from the same species. E.g., (((speciesA_gene1, speciesC_gene1), speciesB_geneX), speciesC_gene2);
The following script (`src/run_rangerdtl.py`) runs Ranger on each of the gene trees. First it creates an input file for each gene tree in our dataset by concatenating the species tree and the gene tree. Then, using `multiprocessing`, it runs `Ranger-DTL-Fast` (from `SupplementaryPrograms/`) and `AggregateRanger` (from `CorePrograms/`) from the directory with the Ranger executables. Note that the output reconciliation files of `Ranger-DTL-Fast` need to be named `{prefix}{index}`, where in our case the prefix is `recon` and the indices begin at 1; the path to these files is then passed to `AggregateRanger` using the prefix only. See the comments in the script for more information. A minimal sketch of the per-tree input-file preparation is shown after the run commands below.
Run Ranger using the script `run_rangerdtl.py` in the following manner:
```bash
# cd RANGER
$ nohup python ../src/run_rangerdtl.py > nohup_ranger.out & disown
```
### Ranger-Fast
Run Ranger-Fast using the same script but with `--fast` flag:
```bash
# cd RANGER-Fast
$ nohup python ../src/run_rangerdtl.py --fast > nohup_ranger_fast.out & disown
```
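A minimal sketch of the per-gene-tree input-file preparation described above (this is what `src/run_rangerdtl.py` automates; the tree file paths and output filenames below are assumptions):
%% Cell type:code id: tags:
``` python
# read the rooted species tree once (assumed path)
with open('../data/1236_wol_tree_pruned_no_internal_labels.nwk') as fo:
    species_tree = fo.read().strip()

# gene trees are assumed to be in a TSV of the form: treeID <tab> rooted Newick tree
with open('../data/1236_pruned_gene_trees.tsv.rooted') as fo:
    for line in fo:
        nog_id, gene_tree = line.rstrip('\n').split('\t')[:2]
        # one Ranger input file per gene tree: species tree first, then the gene tree
        with open(f'{nog_id}.ranger_input', 'w') as out_fo:
            out_fo.write(species_tree + '\n' + gene_tree + '\n')
```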
%% Cell type:markdown id: tags:
## AnGST
%% Cell type:markdown id: tags:
AnGST needs the species tree to be rooted AND to have an additional branch length for the root (prepared in the previous step's notebook; the needed species tree file is named `*_with_root_length.nwk`).
If you alter the scripts below, note that "AnGST.py" can only be run with Python 2.
We need an input file for "AnGST.py". The input file must contain the paths to the species tree, gene tree, output directory, and penalty file. Example (a minimal Python sketch for writing such a file follows the example):
%% Cell type:raw id: tags:
species=/path/X_species_edited_angst.nwk
gene=/path/one_unrooted_genetree.nwk
output=/path/output_result
penalties=/path/penalty.file
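%% Cell type:markdown id: tags:
A minimal sketch of writing one such AnGST input file from Python (all paths are placeholders; `src/run_angst.py`, included later in this commit, generates these files per NOG):
%% Cell type:code id: tags:
``` python
# hypothetical paths for a single NOG
input_lines = [
    'species=../data/1236_wol_tree_pruned_angst.nwk\n',  # rooted species tree with a root branch length
    'gene=./tmp/NOG_example.unrooted.nwk\n',             # one unrooted gene tree
    'output=./Results/NOG_example/\n',                   # per-NOG output directory
    'penalties=../src/angst_penalty_file.txt',           # penalty file created below
]
with open('./tmp/NOG_example.input', 'w') as fo:
    fo.writelines(input_lines)
```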
%% Cell type:markdown id: tags:
We create the penalty file based on the parameters suggested in the manual.
%% Cell type:code id: tags:
``` python
%%bash
# create AnGST directory if it doesn't exist
# write penalty file
# these params were taken from the AnGST manual
mkdir -p ./AnGST
cat > ./src/angst_penalty_file.txt << EOL
hgt: 3.0
dup: 2.0
los: 1.0
spc: 0.0
EOL
```
%% Cell type:markdown id: tags:
Run AnGST using the script `run_angst.py` in the following manner:
```bash
# cd AnGST
$ nohup python ../src/run_angst.py > nohup_run_angst.out & disown
```
%% Cell type:markdown id: tags:
## ALE
The script here makes use of the direct compilation of ALE (instead of using `Docker`). Run the script from a terminal, from inside the `ALE` directory within the `program_runs` directory. For each gene tree, the script starts a new process in which the gene tree is written to a new file, `ALEobserve` is used to create the `.ale` file, and `ALEml_undated` is run to perform the reconciliation. Sometimes the ALE programs fail for no particular reason, so the script retries each ALE program a number of times before giving up. Use `--help` to see the available options and the default values used. A single-tree sketch of the two ALE steps is shown after the command below.
```bash
# cd ALE
$ nohup python ../src/run_ALE.py > run_ALE_nohup.out & disown
```
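For a single gene tree, the two ALE steps amount to the following (a sketch only; the binary location and file names are assumptions matching the defaults of `run_ALE.py`):
%% Cell type:code id: tags:
``` python
import subprocess

ale_bin = '/root/bin/ALE/bin/'  # assumed ALE bin directory
species_tree = '../data/1236_wol_tree_pruned_no_internal_labels.nwk'  # assumed species tree
gene_tree = 'NOG_example.tree'  # hypothetical file holding one gene tree in Newick

# ALEobserve creates the .ale file from the gene tree
subprocess.run([f'{ale_bin}ALEobserve', gene_tree], check=True)
# ALEml_undated reconciles the gene tree against the species tree
subprocess.run([f'{ale_bin}ALEml_undated', species_tree, f'{gene_tree}.ale',
                'separators="."', 'sample=100'], check=True)
```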
%% Cell type:markdown id: tags:
## GLOOME and Count
The function below creates the presence-absence (PA) matrix for GLOOME, based on the PA of taxa in the NOGs of interest. We use that matrix (as a FASTA file input) along with the species tree (and, in a separate run, without the tree) to infer gains and losses. Since Count also uses a PA matrix, but in TSV format, we prepare it here as well. Note that Count requires each row to be a gene family (in our case, a NOG), whereas GLOOME needs each row (i.e. each sequence in the FASTA file) to be a taxon.
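A minimal sketch of building the same presence/absence information in both orientations (one FASTA record per taxon for GLOOME, one TSV row per NOG for Count). The members-file column layout is an assumption here: NOG ID in the second column and comma-separated member gene names of the form `{taxon_ID}_{locus}` in the sixth column; output file names are placeholders.
%% Cell type:code id: tags:
``` python
# map each NOG to the set of taxa that have at least one member gene
nog_to_taxa = {}
with open('1236_subset_NOG_members.tsv') as fo:
    for line in fo:
        cols = line.rstrip('\n').split('\t')
        nog_to_taxa[cols[1]] = {gene.split('_')[0] for gene in cols[5].split(',')}

all_taxa = sorted(set().union(*nog_to_taxa.values()))
all_nogs = sorted(nog_to_taxa)

# GLOOME: each FASTA record is a taxon, each position a NOG (1 = present, 0 = absent)
with open('pa_matrix.fasta', 'w') as fa_out:
    for taxon in all_taxa:
        row = ''.join('1' if taxon in nog_to_taxa[nog] else '0' for nog in all_nogs)
        fa_out.write(f'>{taxon}\n{row}\n')

# Count: each TSV row is a NOG (gene family), each column a taxon
with open('pa_matrix.tsv', 'w') as tsv_out:
    tsv_out.write('family\t' + '\t'.join(all_taxa) + '\n')
    for nog in all_nogs:
        row = '\t'.join('1' if taxon in nog_to_taxa[nog] else '0' for taxon in all_taxa)
        tsv_out.write(nog + '\t' + row + '\n')
```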
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# write param file that uses the rooted tree
cat > ./GLOOME_with_tree/gloome_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_with_tree
EOL
# write param file that doesn't use the rooted tree
cat > ./GLOOME_without_tree/gloome_without_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_without_tree
EOL
```
%% Cell type:markdown id: tags:
Run GLOOME via the command line with each of the param files as input, from inside the respective GLOOME directories (`cd GLOOME_with_tree` / `cd GLOOME_without_tree`). Using `nohup`, `&`, and `disown` here because the process may take a while to finish, and I am running it remotely on a server. The processes shouldn't be affected this way, even if I log out.
```bash
nohup ~/bin/GLOOME.VR01.266 gloome_tree.params > nohup_gloome_run_with_tree.out & disown
nohup ~/bin/GLOOME.VR01.266 gloome_without_tree.params > nohup_gloome_run_without_tree.out & disown
```
%% Cell type:markdown id: tags:
### Run Count
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# exit if command exits with non-zero status
set -e
# change to the directory where Count will be run
cd ./program_runs/Count/Count_MP/
# for `gain` parameter values 0.33, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, run Count
for g in 0.33 0.5 1 2 3 4 5 6 7 8;
do
# run Count
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain $g $2$1_wol_tree_pruned_with_internal_labels.nwk $2$1_pa_matrix.tsv > ${1}_Count_output_gain_${g}.tsv && echo "Count with gain ratio $g done" &&
# grep the output
grep "# PRESENT" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_genome_sizes.tsv &&
grep "# CHANGE" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_changes.tsv &&
grep "# FAMILY" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_families.tsv &&
echo "Count output for gain ratio $g grepped"
done
```
%% Output
Count with gain ratio 0.33 done
Count output for gain ratio 0.33 grepped
Count with gain ratio 0.5 done
Count output for gain ratio 0.5 grepped
Count with gain ratio 1 done
Count output for gain ratio 1 grepped
Count with gain ratio 2 done
Count output for gain ratio 2 grepped
Count with gain ratio 3 done
Count output for gain ratio 3 grepped
Count with gain ratio 4 done
Count output for gain ratio 4 grepped
Count with gain ratio 5 done
Count output for gain ratio 5 grepped
Count with gain ratio 6 done
Count output for gain ratio 6 grepped
Count with gain ratio 7 done
Count output for gain ratio 7 grepped
Count with gain ratio 8 done
Count output for gain ratio 8 grepped
%% Cell type:markdown id: tags:
## Wn
%% Cell type:markdown id: tags:
Wn can be run directly using the script `run_Wn.py` in the `src` directory:
```bash
cd 02-run_programs/program_runs/Wn/
nohup ~/mambaforge/envs/hgt_analyses/bin/python ../../../02-run_programs/src/run_Wn.py > nohup_run_Wn.log & disown
```
The results will be saved by default in the `Results` directory.
../data/program_runs
''' This script does all the functional category analysis. It includes the calculation of absolute and relative transferability, Fisher's exact test on relative transferability, the Kruskal-Wallis test on absolute transferability, and the Mann-Whitney test on meta categories. '''
#list of python libraries
import os
import ete3
import argparse
import statistics
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
parser = argparse.ArgumentParser()
parser.add_argument('-m',metavar='memberfile',type=str,help='eggNOG memberfile')
parser.add_argument('-a',metavar='annotations',type=str,help='eggNOG annotations')
parser.add_argument('-i',metavar='HGT file',type=str,help='HGT outfile. Each line should contain only one putative HGT candidate.')
parser.add_argument('-g',metavar='Input type',type=int,help='HGT input type. Wn_threshold returns a file with putative HGT candidates as specific gene IDs, while phylogenetic programs return either pairwise recipient-donor pairs or only recipients, both as taxonomic IDs. Specify the input as 0 -> gene ID, 1 -> taxonomic ID')
parser.add_argument('-o',metavar='outfile name',type=str,help='Outfile name with pathing.')
parser.add_argument('-f',metavar='BH correction, false positive rate',type=int,default=50,help='BH correction parameter to set the ratio of false positives.')
args = parser.parse_args()
if not args.i:
parser.error("No HGT input given. Please check your inputs again.")
exit
if not args.o:
parser.error("No outfil name. Please check your inputs again.")
exit
if not args.m:
parser.error("No memberfile to be found. Please check your inputs again.")
exit
if not args.a:
parser.error("No annotation file to be found. Please check your inputs again.")
exit
#############################################################################
NOG2IDs = dict() # NOG to taxonomic IDs
NOG2HGT = dict() # NOG to the amount of HGT
prot2NOGs = dict() # protein accession ID to its NOG
ID2NOGs = dict()
NOG2FC = dict() # NOG to its functional category
FC2HGT= dict() #Functional category to a list of proteins belonging to the functional category
cats = ["C","G","E","F","H","I","P","Q","V","D","T","M","N","U","O","J","K","L"]
cat_to_total = dict() # counts the amount of NOGs of one functional category
cat_to_transfers = dict() # counts the amount of transfers/gains of one functional category
cat_to_transferable = dict() # counts the amount of transferable NOGs of one functional category
cat_to_description = {
"C":"Energy production and conversion",
"G":"Carbohydrate transport and metabolism",
"E":"Amino acid transport and metabolism",
"F":"Nucleotide transport and metabolism",
"H":"Coenzyme transport and metabolism",
"I":"Lipid transport and metabolism",
"P":"Inorganic ion transport and metabolism",
"Q":"Secondary metabolites biosynthesis, transport and catabolism",
"V":"Defense mechanisms",
"D":"Cell cycle control, cell division, chromosome partitioning",
"T":"Signal transduction mechanisms",
"M":"Cell wall/membrane/envelope biogenesis",
"N":"Cell motility",
"U":"Intracellular tracking, secretion, and vesicular transport",
"O":"Post-translational modi cation, protein turnover, chaperones",
"J":"Translation, ribosomal structure and biogenesis",
"K":"Transcription",
"L":"Replication, recombination and repair"
}
meta2tottrans = dict()
meta_to_total = dict() ## counts the amount of gene families of one meta category
meta_to_transfers = dict() ## counts the amount of transfers/gains of one meta category
meta_to_transferable = dict() ## counts the amount of transferable gene families of one meta category
FC2tottrans = dict() #FC to all transfers in a list; each element of the list contains the amount of transfers of one NOG; needed for the Kruskal-Wallis test
meta2tottrans = dict() #same as above, only with meta categories
## Declare a dictionary where each functional category points to their corresponding meta category.
## A -> metabolic function, B -> cellular processes/signaling, C -> informational
cat_to_meta = {
"C":"A",
"G":"A",
"E":"A",
"F":"A",
"H":"A",
"I":"A",
"P":"A",
"Q":"A",
"V":"B",
"D":"B",
"T":"B",
"M":"B",
"N":"B",
"U":"B",
"O":"B",
"J":"C",
"K":"C",
"L":"C"
}
# BH-correction function
def BH(pvalues):
pvalues = dict(sorted(pvalues.items(), key=lambda x:x[1]))
FCs = list(pvalues.keys())
critical = 0
pvalue2significant = dict()
for i in range(len(FCs)):
pvalue = pvalues[FCs[i]]
correction = ((i + 1) / len(FCs)) * (args.f / 100)  # BH critical value: (rank / m) * FDR
if pvalue < correction:
if pvalue > critical:
critical = pvalue
pvalue2significant[FCs[i]] = "no"
for i in range(len(FCs)):
if pvalues[FCs[i]] <= critical:
pvalue2significant[FCs[i]] = "yes"
return pvalue2significant
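# Example (hypothetical p-values): BH({"C": 0.001, "K": 0.04, "L": 0.20}) marks as
# significant ("yes") every category whose p-value does not exceed the largest p(i)
# passing the Benjamini-Hochberg criterion p(i) < (rank_i / m) * (args.f / 100).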
################################################################################
for line in open(args.m):
line = line.rstrip()
line = line.split("\t")
NOG = line[1]
protnames = line[4].split(",") # list containing all protein IDs
taxIDs = line[5].split(",") # list containing all taxonomic IDs
NOG2IDs[NOG] = taxIDs
NOG2HGT[NOG] = list()
for name in protnames:
if name not in prot2NOGs.keys():
prot2NOGs[name] = NOG
else:
prot2NOGs[name] += ","+NOG
for ID in taxIDs:
if ID not in ID2NOGs.keys():
ID2NOGs[ID] = NOG
else:
ID2NOGs[ID] += ","+NOG
for line in open(args.a):
line = line.rstrip()
line = line.split("\t")
NOG = line[1]
FC = line[2]
NOG2FC[NOG] = FC
if FC not in FC2HGT.keys():
FC2HGT[FC] = list()
# depending on the HGT input (either gene IDs or taxonomic IDs), the general procedure differs by a bit.
if args.g == 0: # if the HGT input contains gene IDs
for line in open(args.i):
HGTs = line.rstrip().split(",")
for HGT in HGTs:
if HGT in prot2NOGs.keys():
for NOG in prot2NOGs[HGT].split(","):
NOG2HGT[NOG].append(HGT)
else: # if the HGT input contains taxonomic IDs
for line in open(args.i):
HGTs = line.rstrip().split(",")
for HGT in HGTs:
if HGT in ID2NOGs.keys():
for NOG in ID2NOGs[HGT].split(","):
NOG2HGT[NOG].append(HGT)
#########################################################################################################
# Now sort the NOGs into their functional category. After that, count the amount of HGT per functional category/meta category
for NOG in NOG2HGT:
if NOG in NOG2FC.keys():
HGTs = NOG2HGT[NOG]
FC = NOG2FC[NOG]
for HGT in HGTs:
FC2HGT[FC].append(HGT)
''' calculating the relative and absolute transferability for all functional and meta categories '''
#### All FC
rates_FC = open(args.o+"_rates_FC.txt","w") #this is the outfile
for cat in cats: ## We are declaring every key-value pair in the three dictionaries above. Key will be the functional category and the value will be 0.
cat_to_total[cat] = 0
cat_to_transfers[cat] = 0
cat_to_transferable[cat] = 0
for NOG in NOG2FC:
cat = NOG2FC[NOG]
trans = len(NOG2HGT[NOG])
for i in range(len(cat)): ## There might be gene families with more than one functional category, so we iterate through the whole length of "cat".
onecat = cat[i]
if onecat in cats: ## Check if this functional category is even in the list of functional categories we are looking for
cat_to_total[onecat] += 1
cat_to_transfers[onecat] += trans
if cat not in FC2tottrans.keys():
FC2tottrans[cat] = list()
FC2tottrans[cat].append(trans)
else:
FC2tottrans[cat].append(trans)
if trans > 1:
cat_to_transferable[onecat] += 1
## Now calculate all rates.
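## For each functional category (FC):
##   abtrans  = total transfers observed in NOGs of the FC / number of NOGs of the FC
##   reltrans = (fraction of the FC's NOGs that are "transferable", i.e. have more than one transfer)
##              divided by the overall fraction of transferable NOGs across all FCs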
sum_total = sum(list(cat_to_total[FC] for FC in cat_to_total.keys()))
sum_transferables = sum(list(cat_to_transferable[FC] for FC in cat_to_transferable.keys()))
rates_FC.write("#FC\tabtrans\treltrans\n")
for cat in cats:
transfers = cat_to_transfers[cat]
total = cat_to_total[cat]
transferable = cat_to_transferable[cat]
nontransferable = total - transferable
abtrans = transfers/total
rate = transferable/total
reltrans = rate/(sum_transferables/sum_total)
rates_FC.write(cat+"\t"+str(abtrans)+"\t"+str(reltrans)+"\n")
rates_FC.close()
#### Meta categories
rates_meta = open(args.o+"_rates_meta.txt","w")
#later needed for Fisher exact test
total_transfers = 0
total_prots = 0
total_transferable = 0
for meta in ["A","B","C"]: ## We are declaring every key-value pair in the three dictionaries above. Key will be the meta category and the value will be 0.
meta2tottrans[meta] = list()
meta_to_total[meta] = 0
meta_to_transfers[meta] = 0
meta_to_transferable[meta] = 0
for NOG in NOG2FC:
cat = NOG2FC[NOG]
trans = len(NOG2HGT[NOG])
for i in range(len(cat)): ## There might be gene families with more than one functional category, so we iterate through the whole length of "cat".
onecat = cat[i]
if onecat in cats: ## Check if this functional category is even in the list of functional categories we are looking for
## get the meta category of the current functional category
meta = cat_to_meta[onecat]
meta_to_total[meta] += 1
total_prots += 1 #Fisher
meta_to_transfers[meta] += trans
total_transfers += trans #Fisher
meta2tottrans[meta].append(trans)
if float(trans) > 2:
meta_to_transferable[meta] += 1
total_transferable += 1 #Fisher
## Now calculate all transferability values.
sum_total = sum(list(meta_to_total[meta] for meta in meta_to_total.keys()))
sum_transferables = sum(list(meta_to_transferable[meta] for meta in meta_to_transferable.keys()))
rates_meta.write("#meta\tabtrans\treltrans\n")
category_name = {
"A":"metabolic function",
"B":"cellular processes/signalling",
"C":"informational"
}
for meta in ["A","B","C"]:
transfers = meta_to_transfers[meta]
total = meta_to_total[meta]
transferable = meta_to_transferable[meta]
nontransferable = total - transferable
abtrans_meta = transfers/total
rate_meta = transferable/total
reltrans_meta = rate_meta/(sum_transferables/sum_total)
rates_meta.write(meta+"\t"+str(abtrans_meta)+"\t"+str(reltrans_meta)+"\n")
rates_meta.close()
''' Fisher exact test on relative transferability '''
#For all the FC we have the three dictionaries to use:
#cat_to_total
#cat_to_transfers
#cat_to_transferable
fisher_outfile = open(args.o+"_fisher_allFC.txt","w")
total_transferable_FC = sum(list(cat_to_transferable[FC] for FC in cat_to_transferable.keys()))
pvalues = dict()
oddsratios = dict()
for FC in cat_to_transferable.keys():
transferable = cat_to_transferable[FC]
nontransferable = cat_to_total[FC] - transferable
cont_transferable = total_transferable_FC - transferable
cont_nontransferable = sum(list(cat_to_total[FC] for FC in cat_to_total.keys())) - total_transferable_FC - nontransferable
data = [[transferable,nontransferable],[cont_transferable,cont_nontransferable]]
df = pd.DataFrame(data)
oddsratio,pvalue = stats.fisher_exact(df)
pvalues[FC] = pvalue
oddsratios[FC] = oddsratio
BH_pvalues = BH(pvalues)
for FC in cats:
pvalue = pvalues[FC]
significant = BH_pvalues[FC]
oddsratio = oddsratios[FC]
if significant == "yes":
fisher_outfile.write(cat_to_description[FC]+" ("+FC+")\t"+str(pvalue)+"\t"+str(oddsratio)+"\t*\n")
else:
fisher_outfile.write(FC+"\t"+str(pvalue)+"\t"+str(oddsratio)+"\t \n")
fisher_outfile.close()
#For the meta categories we have the three dictionaries to use:
#meta_to_total
#meta_to_transfers
#meta_to_transferable
pvalues = dict()
oddsratios = dict()
fisher_outfile = open(args.o+"_fisher_allmeta.txt","w")
total_transferable_meta = sum(list(meta_to_transferable[meta] for meta in ["A","B","C"]))
for meta in ["A","B","C"]:
transferable = meta_to_transferable[meta]
nontransferable = meta_to_total[meta] - transferable
cont_transferable = total_transferable_meta - transferable #this is the total amount of transfers across all meta categories excluding the current meta cat
cont_nontransferable = sum(list(meta_to_total[meta] for meta in meta_to_total.keys())) - total_transferable_meta - nontransferable #this is the total amount of non-transfers across all meta categories excluding the current meta cat
#print(transferable,nontransferable,cont_transferable,cont_nontransferable)
data = [[transferable,nontransferable],[cont_transferable,cont_nontransferable]]
df = pd.DataFrame(data)
oddsratio, pvalue = stats.fisher_exact(df)
pvalues[meta] = pvalue
oddsratios[meta] = oddsratio
BH_pvalues = BH(pvalues)
for meta in ["A","B","C"]:
pvalue = pvalues[meta]
significant = BH_pvalues[meta]
oddsratio = oddsratios[meta]
if significant == "yes":
fisher_outfile.write(category_name[meta]+" ("+meta+")\t"+str(pvalue)+"\t"+str(oddsratio)+"\t*\n")
else:
fisher_outfile.write(meta+"\t"+str(pvalue)+"\t"+str(oddsratio)+"\t \n")
fisher_outfile.close()
''' Kruskal-Wallis-test on absolute transferability '''
kruskal_outfile = open(args.o+"_kruskal_results.txt","w")
#### for all FC
K = list()
for FC in FC2tottrans.keys():
K.append(FC2tottrans[FC])
kruskal_outfile.write(str(stats.kruskal(*K))+"\n")
kruskal_outfile.close()
''' Mann-Whitney test on meta categories '''
MW_outfile = open(args.o+"_mannwhitney_results.txt","w")
M = list()
for meta in meta2tottrans:
transf=meta2tottrans[meta]
transf=[float(transf[j]) for j in range(1,len(transf))]
M.append(transf)
AB = stats.mannwhitneyu(M[0],M[1])
AC = stats.mannwhitneyu(M[0],M[2])
BC = stats.mannwhitneyu(M[1],M[2])
MW_outfile.write(str(AB)+"\n")
MW_outfile.write(str(AC)+"\n")
MW_outfile.write(str(BC)+"\n")
MW_outfile.close()
hgt: 3.0
dup: 2.0
los: 1.0
spc: 0.0
import os
import subprocess
from multiprocessing import Pool
from loguru import logger
import glob
import shutil
import time
def run_ALE_on_NOG(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries):
"""Run ALE on a single NOG_ID
Args:
NOG_ID (str): NOG ID to run ALE on
species_tree_filepath (str): Path to the species tree file
gene_trees_filepath (str): Path to the gene trees file
bin_dir (str): Path to the ALE bin directory
n_samples (int): Number of reconciliation samples for ALEml_undated
separator (str): Separator for ALEml_undated
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
base_dir = os.getcwd() # this is the directory where the script is run from
genetree_filepath = os.path.join(base_dir, f"{NOG_ID}.tree")
# write this specific genetree to a file by grepping for the NOG_ID
# and then writing the second column to a file
try:
with open(genetree_filepath, 'w') as tree_file_object:
p1 = subprocess.Popen(
["grep", NOG_ID, gene_trees_filepath], stdout=subprocess.PIPE)
p2 = subprocess.Popen(
["awk", "{print $2}"], stdin=p1.stdout, stdout=tree_file_object)
p2.communicate()
if p1.poll() is None:
p1.communicate()
logger.info(f"Successfully wrote gene tree {
NOG_ID} to file {genetree_filepath}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to write gene tree to file with error {e}")
return None
# run ALE on the genetree
# first run ALEobserve to create the .ale file
n_sleep = 2 # sleep for n_sleep seconds before retrying
for i in range(max_retries + 1):
try:
subprocess.run(
[f"{bin_dir}ALEobserve", genetree_filepath], check=True)
break
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEobserve failed with error {
e}, retrying ({i+1}/{max_retries})")
# sleep for n_sleep seconds before retrying
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEobserve failed with error {
e}, no more retries left")
return None
# then run ALEml_undated with retry mechanism
for i in range(max_retries + 1):
try:
aleml_undated_command = [f"{bin_dir}ALEml_undated", species_tree_filepath, f"{
genetree_filepath}.ale", f'separators="{separator}"', f"sample={n_samples}"]
logger.info(f"Running ALEml_undated with command: {
aleml_undated_command}")
subprocess.run(aleml_undated_command, check=True)
return NOG_ID # If ALEml_undated succeeds, return the NOG_ID
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEml_undated failed with error {
e}, retrying ({i+1}/{max_retries})")
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEml_undated failed with error {
e}, no more retries left")
return None
return NOG_ID
def run_ALE_on_NOG_wrapper(args):
return run_ALE_on_NOG(*args)
if __name__ == '__main__':
# calculate total time taken
from datetime import timedelta
import time
start_time = time.time()
# log using start time
logger.info("Starting ALE run")
# write everything in terms of argparse instead of hardcoding
import argparse
parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_no_internal_labels.nwk",
help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_no_internal_labels.nwk)")
parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv.rooted",
help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv.rooted)")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization (default: 50)")
parser.add_argument("--bin", "-b", type=str,
default="/root/bin/ALE/bin/", help="Path to ALE bin directory (default: /root/bin/ALE/bin/)")
parser.add_argument("--samples", "-n", type=int, default=100,
help="Number of reconciliation samples for ALEml_undated (default: 100)")
parser.add_argument("--separator", "-sep", type=str, default=".",
help="Separator for ALEml_undated (default: '.')")
parser.add_argument("--debug", "-d", action="store_true",
help="Enable debug mode (run ALE on first 5 input trees only)")
parser.add_argument("--output-dir", "-o", type=str, default="./Results/",
help="Path to output directory (default: ./Results/)")
parser.add_argument("--retries", "-r", type=int, default=10,
help="Max number of retries for ALEml_undated and ALEobserve (default: 10)")
args = parser.parse_args()
# parse the arguments
species_tree_filepath = args.species
gene_trees_filepath = args.gene
max_threads = args.threads
bin_dir = args.bin
n_samples = args.samples
separator = args.separator
output_dir = args.output_dir
max_retries = args.retries
# log the arguments
logger.info(f"\nArguments passed: \
\nSpecies tree file: {species_tree_filepath} \
\nGene trees file: {gene_trees_filepath} \
\nNumber of threads: {max_threads} \
\nALE bin directory: {bin_dir} \
\nNumber of samples: {n_samples} \
\nSeparator: {separator} \
\nOutput directory: {output_dir} \
\nNumber of retries: {max_retries}")
# read and extract first column as list of NOG_IDs
with open(gene_trees_filepath, "r") as treeIDs:
NOG_ID_list = [line.rstrip().split("\t")[0]
for line in treeIDs.readlines()]
# if debug
if args.debug:
NOG_ID_list = NOG_ID_list[:5]
logger.warning(
"Debug mode enabled. Running ALE on first 5 gene trees only")
max_threads = 5
# log the number of NOGs to run ALE on, with time
logger.info(f"ALE will be run on {(len(NOG_ID_list))} NOGs, using {
str(max_threads)} threads")
with Pool(processes=max_threads) as pool: # run ALE on each NOG in parallel
for pool_out in pool.imap(run_ALE_on_NOG_wrapper, [(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries) for NOG_ID in NOG_ID_list]):
if pool_out is not None: # if ALE run was successful
# log time when you finished this NOG_ID
print(f"{time.strftime('%X %x %Z')}: Finished {pool_out}\n")
# move all files that end with `ale.uTs` to the outputs directory
# check if the outputs directory exists, if not create it. If creating new, raise warning
if os.path.exists(output_dir):
logger.warning(
"The outputs directory already exists. Overwriting files in the outputs directory")
# remove all files in the outputs directory
for file in glob.glob(f"./{output_dir}/*"):
os.remove(file)
else:
os.makedirs(output_dir)
# remove all `*.ale.uml_rec` files
# for file in glob.glob(f"./*.ale.uml_rec"):
# os.remove(file)
# or move them to the outputs directory
for file in glob.glob(f"./*.ale.uml_rec"):
shutil.move(file, output_dir)
# get the filename of the species tree
species_tree_filename = os.path.basename(species_tree_filepath)
for file in glob.glob(f"./*.ale.uTs"):
shutil.move(file, output_dir)
# remove all files that end with .ale
for file in glob.glob(f"./*.ale"):
os.remove(file)
# move all files that end with .tree to the trees directory
# create the directory if it doesn't exist
os.makedirs("./tmp_trees/", exist_ok=True)
for file in glob.glob(f"./*.tree"):
shutil.move(file, "./tmp_trees/")
# calculate total time taken
end_time = time.time()
total_time = end_time - start_time
# log the time taken in a readable format
logger.info(f"Total time taken: {timedelta(seconds=total_time)}")
# Run AnGST with multiprocessing. This script requires python2 to run AnGST.
import os
import subprocess
from multiprocessing import Pool
import time
import logging
import shutil
from datetime import timedelta
from ete3 import Tree
# Create or get the logger
logger = logging.getLogger(__name__)
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s | %(levelname)s | %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def run_ANGST_on_NOG(pool_args):
"""
This function is used to run AnGST via a multiprocessing pool.
For each worker process, the function is called with a different NOG_ID.
Args:
pool_args (tuple): Tuple containing the following elements:
nog_id (str): NOG ID to run AnGST on
species_tree (str): Path to species tree file
gene_tree_filepath (str): Path to gene tree file
output_dir (str): Path to output directory
penalty_file (str): Path to penalty file
bin_dir (str): Path to the AnGST installation directory (containing angst_lib/)
tmp_dir (str): Path to the temporary directory holding the per-NOG input files
Returns:
nog_id (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
nog_id, species_tree, gene_tree_filepath, output_dir, penalty_file, bin_dir, tmp_dir = pool_args
# create input file for AnGST in tmp folder
input_path = os.path.join(tmp_dir, "{}.input".format(nog_id))
angst_script = "{}/angst_lib/AnGST.py".format(bin_dir)
# output directory for AnGST results for this NOG
output_dir = os.path.join(output_dir, nog_id)
input_content = [
"species={}\n".format(species_tree),
"gene={}\n".format(gene_tree_filepath),
"output={}/\n".format(output_dir),
"penalties={}".format(penalty_file)
]
# write input file
with open(input_path, "w") as infile:
infile.writelines(input_content)
# Run AnGST
try:
logger.info("Running AnGST for NOG_ID {}".format(nog_id))
subprocess.check_call(["python2", angst_script, input_path])
except subprocess.CalledProcessError as e:
logger.error(
"AnGST failed for NOG_ID {} with error\n {}".format(nog_id, e))
return None
return nog_id
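# For reference, each generated {nog_id}.input file is a small key=value text file.
# With illustrative paths (the NOG ID and directories are placeholders), it looks like:
#   species=/abs/path/to/1236_wol_tree_pruned_angst.nwk
#   gene=/abs/path/to/tmp_<timestamp>/<NOG_ID>.nwk
#   output=/abs/path/to/Results/<NOG_ID>/
#   penalties=/abs/path/to/angst_penalty_file.txt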
def unroot(newick_string, outfile_path):
"""
This function takes a newick string and writes it to a file after unrooting it.
Args:
newick_string (str): Newick string to be written to file
outfile_path (str): Path to output file
Returns:
None
"""
# read in the newick string
t = Tree(newick_string, format=1)
# unroot the tree
t.unroot()
# write the tree to file
t.write(outfile=outfile_path, format=1)
logger.info("Unrooted gene tree and wrote it to {}".format(outfile_path))
return None
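# Minimal usage sketch for unroot() (the newick string and output path are illustrative):
#   unroot("((A:1,B:1):0.5,C:2);", "/tmp/example_unrooted.nwk")
# The basal bifurcation is collapsed, so the written tree is a trifurcation of the form
# (A:...,B:...,C:...); which is what this script feeds to AnGST as an unrooted gene tree.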
if __name__ == '__main__':
import argparse
# track time
start_time = time.time()
logger.info("Started running {} at {}".format(__file__, time.asctime()))
# use argparse for reading in the number of threads and the input trees file
parser = argparse.ArgumentParser(
description="Run AnGST on a set of gene trees")
parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_angst.nwk",
help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_angst.nwk)")
parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv",
help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv)")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization (default: 50)")
parser.add_argument("--output", "-o", type=str, default="./Results",
help="Output directory for AnGST results (default: ./Results)")
parser.add_argument("--penalties", "-p", type=str, default="../src/angst_penalty_file.txt",
help="Path to penalty file for AnGST (default: ../src/angst_penalty_file.txt)")
parser.add_argument("--bin", "-b", type=str, default="/root/bin/angst/",
help="Path to AnGST bin directory (default: /root/bin/angst/)")
# parse args
args = parser.parse_args()
species_tree = args.species
gene_trees = args.gene
max_threads = args.threads
output_dir = args.output
penalty_file = args.penalties
bin_dir = args.bin
# log arguments
logger.info("Running {} with arguments: {}".format(__file__, args))
# convert all the paths to absolute paths
species_tree = os.path.abspath(species_tree)
gene_trees = os.path.abspath(gene_trees)
output_dir = os.path.abspath(output_dir)
penalty_file = os.path.abspath(penalty_file)
bin_dir = os.path.abspath(bin_dir)
# create the output directory if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logger.info("Created output directory {}".format(output_dir))
else:
logger.info("Output directory {} already exists".format(output_dir))
# delete the output dir and recreate it
shutil.rmtree(output_dir)
os.makedirs(output_dir)
# create tmp directory for gene trees and input files, with a timestamp, with abs path
timestamp = time.strftime("%Y%m%d%H%M%S")
tmp_dir = os.path.join(os.getcwd(), "tmp_{}".format(timestamp))
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
logger.info("Created temporary directory {}".format(tmp_dir))
# Read in the gene trees and create a list of eggNOG IDs
# additionally, write out the species tree and each gene tree to a file as first and second lines,
# with the ID of gene tree as the filename, in the tmp folder
if not os.path.exists(gene_trees):
logger.error("Gene trees file {} does not exist".format(gene_trees))
raise IOError("Gene trees file {} does not exist".format(gene_trees))
else:
logger.info("Reading gene trees from {}".format(gene_trees))
with open(gene_trees, "r") as gene_trees_fo:
gene_trees = gene_trees_fo.readlines()
nog_id_list = [tree.split("\t")[0] for tree in gene_trees]
# Debugging: run AnGST on last n_debug NOGs only
# n_debug=2; nog_id_list = nog_id_list[-n_debug:]; max_threads=n_debug; gene_trees=gene_trees[-n_debug:]
# unroot the gene trees and write them to file
for gene_tree in gene_trees:
nog_id, nog_newick = gene_tree.split("\t")
# first read in the tree using ete3 and then write it out after unrooting it
try:
unroot(nog_newick, "{}/{}.nwk".format(tmp_dir, nog_id))
except Exception as e:
logger.error(
"Error unrooting gene tree for NOG_ID {}\n{}".format(nog_id, e))
continue
# prepare list of tuples to be fed into each process for parallelization
pool_args = [(nog_id, species_tree, "{}/{}.nwk".format(tmp_dir, nog_id),
output_dir, penalty_file, bin_dir, tmp_dir) for nog_id in nog_id_list]
# create the pool. Note that this is python2, so we can't use `with Pool() as pool:`
pool = Pool(processes=max_threads)
# use the pool
for pool_out in pool.imap(run_ANGST_on_NOG, pool_args):
if pool_out is not None:
logger.info(
"AnGST run for {} completed successfully".format(pool_out))
# close and join the pool
pool.close()
pool.join()
# Optional: remove the temporary directory
# shutil.rmtree(tmp_dir); logger.info("Removed temporary directory {}".format(tmp_dir))
# track time
end_time = time.time()
elapsed_time = end_time - start_time
logger.info("Finished running {} at {}, took {}".format(
__file__, time.asctime(), str(timedelta(seconds=elapsed_time))))
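# Example invocation (the script filename is a placeholder; the flags are those defined above):
#   python2 run_angst_multiprocessing.py \
#       -s ../../data/1236_wol_tree_pruned_angst.nwk \
#       -g ../../data/1236_pruned_gene_trees.tsv \
#       -t 50 -o ./Results -p ../src/angst_penalty_file.txt -b /root/bin/angst/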
import subprocess
import os
from multiprocessing import Pool
from loguru import logger
import time
import shutil
from datetime import timedelta
def run_RANGERDTL_on_NOG(NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin):
"""
This function is used to run RangerDTL in a multithreaded procedure.
For each thread/worker process, the function is called with a different NOG_ID.
Args:
NOG_ID (str): NOG ID to run RangerDTL on
tmp_dir (str): path to temporary directory to store input files
bin_dir (str): path to Ranger bin directory
n_runs (int): number of runs to perform
ranger_bin (str): path of the Ranger-DTL executable to run, relative to bin_dir
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
AggregateRanger summarises the n_runs reconciliations into "./Results/{NOG_ID}_aggregateoutput.txt".
The individual reconciliation outputs are written to "./Results/tmp_{NOG_ID}_results/",
while the per-NOG input files live in the temporary directory created by the main block.
The input file is deleted once the runs finish; removing the per-NOG results folder is optional.
"""
# Make a folder for the results of each tree
os.makedirs(f"./Results/tmp_{NOG_ID}_results", exist_ok=True)
# use the NOG_ID to define the filename of the input file
input_file = f"{tmp_dir}/{NOG_ID}.input"
# Run Ranger to perform n_runs reconciliations
for i in range(1, n_runs+1):
# Run RangerDTL. Output files need to have prefix "recon{index}", with index starting at 1.
try:
subprocess.run(f"{bin_dir}/{ranger_bin} -i {input_file} -o ./Results/tmp_{
NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
# subprocess.run(f"{bin_dir}/SupplementaryPrograms/Ranger-DTL-Fast.linux -i {input_file} -o ./Results/tmp_{
# NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
# subprocess.run(f"{bin_dir}/CorePrograms/Ranger-DTL.linux -i {input_file} -o ./Results/tmp_{
# NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
except subprocess.CalledProcessError as e:
logger.error(f"RangerDTL failed for NOG_ID {
NOG_ID} with error\n {e}")
return None
logger.info(f"Running RangerDTL finished for NOG_ID {NOG_ID}")
# Aggregate the results of the runs into one file.
try:
subprocess.run(f"{bin_dir}/CorePrograms/AggregateRanger.linux ./Results/tmp_{
NOG_ID}_results/recon > ./Results/{NOG_ID}_aggregateoutput.txt", shell=True)
logger.info(f"Aggregating results finished for NOG_ID {NOG_ID}")
except subprocess.CalledProcessError as e:
logger.error(f"Aggregating RangerDTL results failed for NOG_ID {
NOG_ID} with error\n {e}")
return None
# Optional: Delete the input file because it is not needed anymore
if os.path.exists(input_file):
os.remove(input_file)
logger.info(f"Deleted input file {input_file}")
# Optional: remove tmp_{NOG_ID}_results folder
# shutil.rmtree(f"./Results/tmp_{NOG_ID}_results")
return NOG_ID
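# For reference, the {NOG_ID}.input file handed to Ranger-DTL (written in the __main__ block
# below) contains exactly two newick lines: the rooted species tree followed by the rooted
# gene tree, e.g. (leaf names here are placeholders):
#   ((S1,S2)N1,S3)N2;
#   ((S1_gene1,S2_gene1),S3_gene1);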
def run_RANGERDTL_on_NOG_wrapper(args):
"""
Wrapper function for running RangerDTL on a single NOG_ID
Args:
args (tuple): Tuple containing (NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin),
matching the signature of run_RANGERDTL_on_NOG (Pool.imap passes a single argument, hence this wrapper)
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
return run_RANGERDTL_on_NOG(*args)
if __name__ == '__main__':
# log time
logger.info(f"Started running {__file__} at {time.asctime()}")
start_time = time.time()
# import argparse and read in the arguments
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--speciestree", "-s", help="Path to the species tree (default: ../../data/1236_wol_tree_pruned_with_internal_labels.nwk)",
type=str, default="../../data/1236_wol_tree_pruned_with_internal_labels.nwk")
parser.add_argument("--genetrees", "-g", help="Path to the gene trees (default: ../../data/1236_pruned_gene_trees.tsv.rooted.underscored)", type=str,
default="../../data/1236_pruned_gene_trees.tsv.rooted.underscored")
parser.add_argument("--threads", "-t",
help="Number of threads to use (default: 50)", type=int, default=50)
parser.add_argument("--bin", "-b", help="Path to the Ranger bin directory (default: /root/bin/RANGERDTL_LINUX/)",
type=str, default="/root/bin/RANGERDTL_LINUX/")
parser.add_argument(
"--runs", "-r", help="Number of runs to perform (default: 100)", type=int, default=100)
parser.add_argument(
"--fast", "-f", help="Use RANGER-DTL-Fast instead of RANGER-DTL (default: False)", default=False, action="store_true")
args = parser.parse_args()
# log arguments
logger.info(f"Running {__file__} with arguments: {args}")
# Read in the arguments
species_tree_filepath = args.speciestree
gene_trees_filepath = args.genetrees
max_threads = args.threads
bin_dir = args.bin
n_runs = args.runs
if args.fast:
ranger_bin = "SupplementaryPrograms/Ranger-DTL-Fast.linux"
else:
ranger_bin = "CorePrograms/Ranger-DTL.linux"
# create temporary dir for the input files to RANGER, using tempfile with date-time stamp
timestamp = time.strftime("%Y%m%d%H%M%S")
tmp_dir = f"./tmp_ranger_{timestamp}"
os.makedirs(tmp_dir, exist_ok=True)
# Read in the newick string of the species tree
if not os.path.exists(species_tree_filepath):
logger.error(f"Species tree file {
species_tree_filepath} does not exist")
raise FileNotFoundError(f"Species tree file {
species_tree_filepath} does not exist")
else:
logger.info(f"Reading species tree from {species_tree_filepath}")
with open(species_tree_filepath, "r") as species_tree:
species_tree_newick = species_tree.read()
# make sure the newick string ends with a newline character. If not, add it.
if not species_tree_newick.endswith("\n"):
species_tree_newick += "\n"
# Read in the gene trees and create a list of eggNOG IDs
# additionally, write out the species tree and each gene tree to a file as first and second lines,
# with the ID of gene tree as the filename, in the tmp folder
if not os.path.exists(gene_trees_filepath):
logger.error(f"Gene trees file {gene_trees_filepath} does not exist")
raise FileNotFoundError(
f"Gene trees file {gene_trees_filepath} does not exist")
else:
logger.info(f"Reading gene trees from {gene_trees_filepath}")
with open(gene_trees_filepath, "r") as gene_trees_fo:
gene_trees = gene_trees_fo.readlines()
NOG_ID_list = [tree.split("\t")[0] for tree in gene_trees]
for gene_tree in gene_trees:
nog_id, nog_newick = gene_tree.split("\t")
# add newline character to newick string if it doesn't end with it
if not nog_newick.endswith("\n"):
nog_newick += "\n"
with open(f"{tmp_dir}/{nog_id}.input", "w") as gene_tree_fo:
gene_tree_fo.write(species_tree_newick)
gene_tree_fo.write(nog_newick)
# Debugging: Run RangerDTL on last 10 NOGs only
# NOG_ID_list = NOG_ID_list[-2:]; max_threads=2;
logger.info(f"Running RangerDTL with {max_threads} threads on {
len(NOG_ID_list)} NOGs, performing {n_runs} runs for each NOG.")
with Pool(processes=max_threads) as pool:
# iterate through what has been created via pool.imap()
for pool_out in pool.imap(run_RANGERDTL_on_NOG_wrapper, [(NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin) for NOG_ID in NOG_ID_list]):
if pool_out is not None:
logger.info(f"RangerDTL finished for {pool_out}")
# log time taken
end_time = time.time()
elapsed_time = end_time - start_time
logger.info(f"Finished running {__file__} at {time.asctime()}, took {
str(timedelta(seconds=elapsed_time))}")
../data/compiled_results