Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
Comparative Study of HGT Inference Methods
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
general
ccb
Comparative Study of HGT Inference Methods
Commits
7adb3a5e
Commit
7adb3a5e
authored
2 months ago
by
Swastik Mishra
Browse files
Options
Downloads
Patches
Plain Diff
consider ARG genes Fisher exact
parent
baff63c3
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
03-analyse_outputs/lib/antibiotic_resistance_genes_analysis.py
+358
-0
358 additions, 0 deletions
...alyse_outputs/lib/antibiotic_resistance_genes_analysis.py
with
358 additions
and
0 deletions
03-analyse_outputs/lib/antibiotic_resistance_genes_analysis.py
0 → 100644
+
358
−
0
View file @
7adb3a5e
import
os
import
pandas
as
pd
import
numpy
as
np
from
scipy
import
stats
from
IPython.display
import
display
from
tqdm
import
tqdm
def
read_nogwise_data
(
compiled_res_dir
):
compiled_nogwise_filepaths
=
[
os
.
path
.
join
(
root
,
fi
)
for
root
,
_
,
files
in
os
.
walk
(
compiled_res_dir
)
for
fi
in
files
if
fi
.
startswith
(
"
compiled_transfers.nogwise.
"
)
and
"
branchwise
"
not
in
fi
]
compiled_nogwise_filepaths_dict
=
{
fi
.
partition
(
"
compiled_transfers.nogwise.
"
)[
2
].
partition
(
"
.tsv
"
)[
0
]:
fi
for
fi
in
compiled_nogwise_filepaths
}
compiled_nogwise_dfs
=
{
key
:
pd
.
read_csv
(
value
,
sep
=
"
\t
"
)
for
key
,
value
in
compiled_nogwise_filepaths_dict
.
items
()
}
# drop 'gloome_branch_name' column if it exists in any df
for
key
,
cn_df
in
compiled_nogwise_dfs
.
items
():
if
"
gloome_branch_name
"
in
cn_df
.
columns
:
cn_df
.
drop
(
columns
=
[
"
gloome_branch_name
"
],
inplace
=
True
)
# sort dfs by 'transfers'
try
:
compiled_nogwise_dfs
[
key
]
=
cn_df
.
sort_values
(
by
=
"
transfer_threshold
"
)
except
KeyError
as
e
:
raise
KeyError
(
f
"
KeyError for
{
key
}
:
{
e
}
"
)
return
compiled_nogwise_dfs
def
arg_thresholdwise_mwu
(
method
:
str
,
method_thresholded_df
:
pd
.
DataFrame
,
):
"""
This function takes a dataframe and performs a Mann-Whitney U test
to compare the
'
corrected_transfers
'
column between two groups:
ARG NOGs and non-ARG NOGs.
It returns a dictionary with the results of the MWU test, CLES, and mean values.
"""
# perform MWU test
arg_nogs
=
method_thresholded_df
[
method_thresholded_df
[
"
ARG
"
]
==
True
]
non_arg_nogs
=
method_thresholded_df
[
method_thresholded_df
[
"
ARG
"
]
==
False
]
# Check if both groups are non-empty
if
len
(
arg_nogs
)
<
5
:
# print("Warning: Not enough ARG NOGs for MWU test. Skipping method:", method,
# "and transfer threshold:", method_thresholded_df["transfer_threshold"].min())
return
{}
if
len
(
non_arg_nogs
)
<
5
:
# print("Warning: Not enough non-ARG NOGs for MWU test. Skipping method:", method,
# "and transfer threshold:", method_thresholded_df["transfer_threshold"].min())
return
{}
# Perform MWU test
mwu
,
comparison_p
=
stats
.
mannwhitneyu
(
arg_nogs
[
"
corrected_transfers
"
],
non_arg_nogs
[
"
corrected_transfers
"
],
alternative
=
"
two-sided
"
,
)
# Find the median values and lengths
arg_median
=
arg_nogs
[
"
corrected_transfers
"
].
median
()
non_arg_median
=
non_arg_nogs
[
"
corrected_transfers
"
].
median
()
arg_nogs_count
=
len
(
arg_nogs
)
non_arg_nogs_count
=
len
(
non_arg_nogs
)
# Calculate CLES
cles
=
mwu
/
(
arg_nogs_count
*
non_arg_nogs_count
)
# Create a dictionary to store the results and return it
comparison_res_dict
=
{
"
mwu
"
:
mwu
,
"
p_value
"
:
comparison_p
,
"
arg_median
"
:
arg_median
,
"
non_arg_median
"
:
non_arg_median
,
"
difference_in_medians
"
:
arg_median
-
non_arg_median
,
"
arg_nogs_count
"
:
arg_nogs_count
,
"
non_arg_nogs_count
"
:
non_arg_nogs_count
,
"
cles
"
:
cles
,
}
return
comparison_res_dict
def
arg_thresholdwise_fisher_exact
(
method
:
str
,
method_thresholded_df
:
pd
.
DataFrame
,
arg_ratio
:
tuple
=
(),
# (#arg, #non-arg): only if "fisher_exact"
):
"""
This function takes a dataframe and performs a Fisher
'
s exact test
to compare the number of NOGs with non-zero corrected transfers
between two groups: ARG NOGs and non-ARG NOGs, as compared to the
arg_ratio (the ratio of ARG NOGs to non-ARG NOGs in the entire dataset).
It returns a dictionary with the results of the Fisher
'
s exact test.
"""
# Check if the arg_ratio is valid
if
len
(
arg_ratio
)
!=
2
or
arg_ratio
[
0
]
<=
0
or
arg_ratio
[
1
]
<=
0
:
raise
ValueError
(
"
arg_ratio must be a tuple of two positive integers: (#arg, #non-arg)
"
)
# Check if the dataframe is empty
if
method_thresholded_df
.
empty
:
print
(
"
Warning: No data for method:
"
,
method
,
"
and transfer threshold:
"
,
method_thresholded_df
[
"
transfer_threshold
"
].
min
())
return
{}
# Create a contingency table
contingency_table
=
pd
.
DataFrame
({
"
ARG
"
:
[
method_thresholded_df
[(
method_thresholded_df
[
"
ARG
"
]
==
True
)
&
(
method_thresholded_df
[
"
corrected_transfers
"
]
>
0
)][
"
nog_id
"
].
nunique
(),
arg_ratio
[
0
]
],
"
non-ARG
"
:
[
method_thresholded_df
[(
method_thresholded_df
[
"
ARG
"
]
==
False
)
&
(
method_thresholded_df
[
"
corrected_transfers
"
]
>
0
)][
"
nog_id
"
].
nunique
(),
arg_ratio
[
1
]
]
},
index
=
[
"
observed
"
,
"
expected
"
])
# Perform Fisher's exact test
odds_ratio
,
comparison_p
=
stats
.
fisher_exact
(
contingency_table
,
alternative
=
"
greater
"
,
# greater: ratio of ARG NOGs to non-ARG NOGs is greater than expected
)
# Create a dictionary to store the results and return it
comparison_res_dict
=
{
"
odds_ratio
"
:
odds_ratio
,
"
p_value
"
:
comparison_p
,
"
arg_nogs_count
"
:
contingency_table
.
loc
[
"
observed
"
,
"
ARG
"
],
"
non_arg_nogs_count
"
:
contingency_table
.
loc
[
"
observed
"
,
"
non-ARG
"
],
"
arg_nogs_expected
"
:
contingency_table
.
loc
[
"
expected
"
,
"
ARG
"
],
"
non_arg_nogs_expected
"
:
contingency_table
.
loc
[
"
expected
"
,
"
non-ARG
"
],
"
total_nogs_count
"
:
contingency_table
.
loc
[
"
observed
"
].
sum
()
}
return
comparison_res_dict
def
arg_thresholdwise_test
(
method
:
str
,
method_thresholded_df
:
pd
.
DataFrame
,
arg_ratio
:
tuple
=
(),
# (#arg, #non-arg): only if "fisher_exact"
comparison_method
:
str
=
"
mwu
"
,
# "mwu", "fisher_exact"
):
valid_comparison_methods
=
[
"
mwu
"
,
"
fisher_exact
"
]
# depending on the comparison_method, we call the appropriate function
if
comparison_method
==
"
mwu
"
:
# perform MWU test
comparison_res_dict
=
arg_thresholdwise_mwu
(
method
=
method
,
method_thresholded_df
=
method_thresholded_df
,
)
elif
comparison_method
==
"
fisher_exact
"
:
# perform Fisher's exact test
comparison_res_dict
=
arg_thresholdwise_fisher_exact
(
method
=
method
,
method_thresholded_df
=
method_thresholded_df
,
arg_ratio
=
arg_ratio
,
)
else
:
raise
ValueError
(
f
"
Comparison method must be one of
{
valid_comparison_methods
}
"
)
# Check if the comparison_res_dict is empty
if
not
comparison_res_dict
:
# print("Warning: No data for method:", method, "and transfer threshold:", method_thresholded_df["transfer_threshold"].min())
return
{}
return
comparison_res_dict
def
arg_transferrability_analysis
(
compiled_nogwise_dfs
:
dict
,
correction_var
:
str
=
"
#genes
"
,
comparison_method
:
str
=
"
mwu
"
,
# "mwu", "fisher_exact"
arg_ratio
:
tuple
=
(),
# (#arg, #non-arg): only if "fisher_exact"
):
"""
This function takes a dictionary of dataframes and a correction variable.
For each method,df in the dictionary, it normalizes the
'
transfers
'
column by the correction variable.
Then at transfer_threshold in the
'
transfer_threshold
'
column,
it performs a test to compare the
'
corrected_transfers
'
column between two groups, if
is significantly different between the two groups of ARG NOGs and non-ARG NOGs.
Assuming that if any of the genes in a NOG is ARG, then the NOG is considered ARG.
Comparison methods are:
-
"
mwu
"
: Mann-Whitney U test comparing distributions of corrected transfers between the two groups
-
"
fisher_exact
"
: Fisher
'
s exact test comparing the number of NOGs with non-zero corrected transfers
The function returns a dataframe with the results of the tests.
"""
# Check if arg_ratio is not None if comparison_method is "fisher_exact"
if
comparison_method
==
"
fisher_exact
"
and
arg_ratio
==
():
raise
ValueError
(
"
valid arg_ratio must be provided if comparison_method is
'
fisher_exact
'"
)
all_method_comparison_res_records
=
[]
for
method
,
method_df
in
tqdm
(
compiled_nogwise_dfs
.
items
(),
desc
=
"
Processing methods
"
,
unit
=
"
method
"
):
# Check if the correction variable exists in the dataframe
if
correction_var
not
in
method_df
.
columns
:
raise
ValueError
(
f
"
Correction variable
'
{
correction_var
}
'
not found in dataframe for method
'
{
method
}
'"
)
# Normalize the 'transfers' column by the correction variable
method_df
[
"
corrected_transfers
"
]
=
method_df
[
"
transfers
"
]
/
method_df
[
correction_var
]
# remove nans
method_df
.
dropna
(
inplace
=
True
)
# Check if the transfer_threshold column is numeric
if
not
pd
.
api
.
types
.
is_numeric_dtype
(
method_df
[
"
transfer_threshold
"
]):
raise
ValueError
(
f
"
Transfer threshold column is not numeric for method
'
{
method
}
'"
)
# Perform MWU test, separately for each transfer threshold
if
method_df
[
"
transfer_threshold
"
].
unique
().
size
>
1
:
# if integer/integer-reciprocal transfer thresholds, these are gain-loss penalties
if
all
((
value
.
is_integer
()
or
round
((
1
/
value
)).
is_integer
())
for
value
in
method_df
[
"
transfer_threshold
"
].
unique
()
if
value
>
0
):
print
(
f
"
Processing method:
{
method
}
with integer/integer-reciprocal transfer thresholds
"
)
# Get the unique transfer thresholds and sort them
transfer_thresholds
=
method_df
[
"
transfer_threshold
"
].
unique
()
transfer_thresholds
.
sort
()
for
transfer_threshold
in
transfer_thresholds
:
threshold_df
=
method_df
[
method_df
[
"
transfer_threshold
"
]
==
transfer_threshold
].
copy
()
# display(threshold_df)
# the total num transfers is the sum of corrected_transfers for all NOGs
num_transfers
=
threshold_df
[
"
corrected_transfers
"
].
sum
()
# Check if the threshold_df is empty
if
threshold_df
.
empty
:
print
(
f
"
Warning: No data for transfer threshold
{
transfer_threshold
}
in method
{
method
}
. Skipping.
"
)
continue
threshold_comparison_res_dict
=
arg_thresholdwise_test
(
method
=
method
,
method_thresholded_df
=
threshold_df
,
arg_ratio
=
arg_ratio
,
comparison_method
=
comparison_method
,
)
# Store the results in the dictionary
if
threshold_comparison_res_dict
:
# add the method, transfer_threshold (two separate keys) and returned dict items to the results list
# so the list is structured as:
# [{'method': method, 'transfer_threshold': transfer_threshold, 'mwu': mwu, ...}, {'method': method, ...}, ...]
threshold_comparison_res_dict
[
"
method
"
]
=
method
threshold_comparison_res_dict
[
"
transfer_threshold
"
]
=
transfer_threshold
threshold_comparison_res_dict
[
"
num_transfers
"
]
=
num_transfers
all_method_comparison_res_records
.
append
(
threshold_comparison_res_dict
)
else
:
# these are not integer/integer-reciprocal transfer thresholds
print
(
f
"
Processing method:
{
method
}
with transfer thresholds that aren
'
t integer/integer-reciprocal
"
)
skipped_thresholds
=
[]
if
comparison_method
==
"
mwu
"
:
# we split the transfers themselves into 10 cumulative bins
# and find the min transfer_threshold for each bin
min_transfers
=
1
max_transfers
=
method_df
[
"
corrected_transfers
"
].
shape
[
0
]
# create 10 cumulative bins that fit better on a log-scale
# first split the range into 20 points and store in a list
edge_range
=
21
# 20 points
transfers_edges
=
[
max_transfers
-
(
(
max_transfers
-
min_transfers
)
*
(
0.5
**
i
)
)
for
i
in
range
(
edge_range
)]
print
(
f
"
For method
{
method
}
, the transfer edges are:
{
transfers_edges
}
"
)
# now for each of these edges e, take the threshold_df as the top e of the data
method_df
=
method_df
.
sort_values
(
by
=
"
transfer_threshold
"
,
ascending
=
False
)
for
i
in
range
(
edge_range
):
# get the top e rows of the dataframe
threshold_df
=
method_df
.
head
(
int
(
np
.
floor
(
transfers_edges
[
i
]))).
copy
()
transfer_threshold
=
threshold_df
[
"
transfer_threshold
"
].
min
()
# the number of transfers is the sum of corrected_transfers for all NOGs
# but note that generally for the methods in this section
# the 'transfer_threshold' is
# not cumulative but only an expectation for each NOG
num_transfers
=
threshold_df
[
"
corrected_transfers
"
].
sum
()
# Check if the threshold_df is empty
if
threshold_df
.
empty
:
print
(
f
"
Warning: No data for transfer threshold
{
transfer_threshold
}
in method
{
method
}
.
"
f
"
Number of NOGs:
{
len
(
threshold_df
)
}
for edge
{
transfers_edges
[
i
]
}
. Skipping.
"
)
continue
threshold_comparison_res_dict
=
arg_thresholdwise_test
(
method
=
method
,
method_thresholded_df
=
threshold_df
,
arg_ratio
=
arg_ratio
,
comparison_method
=
comparison_method
,
)
# Store the results in the dictionary
if
threshold_comparison_res_dict
:
# add the method, transfer_threshold (two separate keys) and returned dict items to the results list
# so the list is structured as:
# [{'method': method, 'transfer_threshold': transfer_threshold, 'mwu': mwu, ...}, {'method': method, ...}, ...]
threshold_comparison_res_dict
[
"
method
"
]
=
method
threshold_comparison_res_dict
[
"
transfer_threshold
"
]
=
transfer_threshold
all_method_comparison_res_records
.
append
(
threshold_comparison_res_dict
)
else
:
skipped_thresholds
.
append
(
transfer_threshold
)
print
(
f
"
Warning: For method
{
method
}
, skipped transfer thresholds:
{
skipped_thresholds
}
"
)
elif
comparison_method
==
"
fisher_exact
"
:
# we split the #nogs into 10 cumulative bins
# and find the min transfer_threshold for each bin
# first split the range into 20 points and store in a list
edge_range
=
21
# 20 points
num_nogs_edges
=
[
method_df
[
"
nog_id
"
].
nunique
()
-
(
(
method_df
[
"
nog_id
"
].
nunique
()
-
1
)
*
(
0.5
**
i
)
)
for
i
in
range
(
edge_range
)
]
print
(
f
"
For method
{
method
}
, the NOG edges are:
{
num_nogs_edges
}
"
)
# now for each of these edges e, take the threshold_df as the top e number of NOGs from the data
method_df
=
method_df
.
sort_values
(
by
=
"
transfer_threshold
"
,
ascending
=
False
)
for
i
in
range
(
edge_range
):
# get the rows corresponding to the first e nogs from the dataframe
selected_nog_ids
=
method_df
[
"
nog_id
"
].
unique
()[:
int
(
np
.
floor
(
num_nogs_edges
[
i
]))]
threshold_df
=
method_df
[
method_df
[
"
nog_id
"
].
isin
(
selected_nog_ids
)
].
copy
()
transfer_threshold
=
threshold_df
[
"
transfer_threshold
"
].
min
()
# Check if the threshold_df is empty
if
threshold_df
.
empty
:
print
(
f
"
Warning: No data for transfer threshold
{
transfer_threshold
}
in method
{
method
}
.
"
f
"
Number of NOGs:
{
len
(
threshold_df
)
}
for edge
{
num_nogs_edges
[
i
]
}
. Skipping.
"
)
continue
threshold_comparison_res_dict
=
arg_thresholdwise_test
(
method
=
method
,
method_thresholded_df
=
threshold_df
,
arg_ratio
=
arg_ratio
,
comparison_method
=
comparison_method
,
)
# Store the results in the dictionary
if
threshold_comparison_res_dict
:
# add the method, transfer_threshold (two separate keys) and returned dict items to the results list
# so the list is structured as:
# [{'method': method, 'transfer_threshold': transfer_threshold, 'mwu': mwu, ...}, {'method': method, ...}, ...]
threshold_comparison_res_dict
[
"
method
"
]
=
method
threshold_comparison_res_dict
[
"
transfer_threshold
"
]
=
transfer_threshold
all_method_comparison_res_records
.
append
(
threshold_comparison_res_dict
)
else
:
skipped_thresholds
.
append
(
transfer_threshold
)
print
(
f
"
Warning: For method
{
method
}
, skipped transfer thresholds:
{
skipped_thresholds
}
"
)
elif
method_df
[
"
transfer_threshold
"
].
unique
().
size
==
1
:
# if there is only one transfer threshold, we can just use the whole dataframe
threshold_df
=
method_df
.
copy
()
# test
threshold_comparison_res_dict
=
arg_thresholdwise_test
(
method
=
method
,
method_thresholded_df
=
threshold_df
,
arg_ratio
=
arg_ratio
,
comparison_method
=
comparison_method
,
)
# Store the results in the dictionary
if
threshold_comparison_res_dict
:
# add the method, transfer_threshold (two separate keys) and returned dict items to the results list
# so the list is structured as:
# [{'method': method, 'transfer_threshold': transfer_threshold, 'mwu': mwu, ...}, {'method': method, ...}, ...]
threshold_comparison_res_dict
[
"
method
"
]
=
method
threshold_comparison_res_dict
[
"
transfer_threshold
"
]
=
method_df
[
"
transfer_threshold
"
].
min
()
all_method_comparison_res_records
.
append
(
threshold_comparison_res_dict
)
else
:
# if method_df["transfer_threshold"].unique().size == 0:
print
(
f
"
Warning: No data for transfer threshold in method
{
method
}
. Skipping.
"
)
continue
# Create a dataframe from the results list
all_method_comparison_res_df
=
pd
.
DataFrame
(
all_method_comparison_res_records
)
# Sort the dataframe by method and transfer_threshold
all_method_comparison_res_df
.
sort_values
(
by
=
[
"
method
"
,
"
transfer_threshold
"
],
inplace
=
True
)
# Reset the index
all_method_comparison_res_df
.
reset_index
(
drop
=
True
,
inplace
=
True
)
# return the dataframe
return
all_method_comparison_res_df
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment