Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Jan Hoeckesfeld
Snakemake Ngs Spa Typing
Commits
41c8840a
Commit
41c8840a
authored
Nov 17, 2020
by
Jan Hoeckesfeld
Browse files
added possibility to compute distance matrix over the V kmers
parent
120b51b5
Changes
3
Hide whitespace changes
Inline
Side-by-side
envs/biopythonworkbench.yaml
View file @
41c8840a
...
...
@@ -13,6 +13,8 @@ dependencies:
-
samtools =
1.11
-
seaborn = 0.11.0
-
scipy = 1.5.2
-
scikit-learn = 0.23.2
-
numpy = 1.18.5
-
pysam = 0.16.0.1
-
mpmath = 1.1.0
-
matplotlib-venn
=
0.11.5
-
matplotlib-venn
=
0.11.5
rules/kmerApproach.snk
View file @
41c8840a
...
...
@@ -452,3 +452,21 @@ rule createRatios:
script:
'../scripts/calculateKmerRatios.py'
rule createDistanceMatrixOverKmersOfV:
input:
kmers = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
output:
V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz'
params:
k = lambda wildcards: wildcards.kmer,
hamming_distance_cutoff = 5,
#cluster execution
cpus = '2',
mem = '4G',
gpus = '0',
walltime = '00:15:00'
conda:
'../envs/biopythonworkbench.yaml'
script:
'../scripts/calcDistanceMatrixVkmers.py'
scripts/calcDistanceMatrixVkmers.py
0 → 100644
View file @
41c8840a
import
json
import
numpy
as
np
import
time
from
scipy.sparse
import
coo_matrix
,
vstack
,
save_npz
from
sklearn.metrics
import
pairwise_distances_chunked
,
pairwise_distances
##############INPUT######################################
# eg "spaSequences.counts.json"
f
=
open
(
snakemake
.
input
[
'kmers'
],
"r"
)
expected_json
=
f
.
read
()
person_dict
=
json
.
loads
(
expected_json
)
kmers
=
{
k
for
kmers
in
person_dict
.
values
()
for
k
in
kmers
.
keys
()}
kmers
=
sorted
(
kmers
)
n
=
len
(
kmers
)
m
=
len
(
kmers
[
0
])
#########################################################
# eg 5
ALLOWED_DISTANCE
=
int
(
snakemake
.
params
[
'hamming_distance_cutoff'
])
/
m
def
reduce_func
(
D_chunk
,
start
):
neigh
=
np
.
where
(
D_chunk
<
ALLOWED_DISTANCE
,
D_chunk
,
0
)
return
(
neigh
*
m
).
astype
(
'u1'
)
base_dict
=
{
"A"
:
0
,
"C"
:
1
,
"G"
:
2
,
"T"
:
3
}
def
base_to_int
(
c
):
return
base_dict
[
c
]
def
kmer_to_int
(
v
):
return
[
base_to_int
(
c
)
for
c
in
v
]
def
kmers_to_int
(
kmers
):
return
[
kmer_to_int
(
k
)
for
k
in
kmers
]
# sklearn only computes integer hamming distances
kmers_int
=
kmers_to_int
(
kmers
)
M
=
coo_matrix
((
0
,
n
),
dtype
=
'u1'
)
print
(
M
.
shape
)
# To big without cutoff. If dtype 'u1' is used, still about 10 GB:
# print(pairwise_distances(kmers_int, metric="hamming"))
cpus
=
-
1
# working_memory = 1024
# eg snakemake.params['mem'] = 1G
working_memory
=
int
(
snakemake
.
params
[
'mem'
][:
-
1
])
*
1000
-
500
gen
=
pairwise_distances_chunked
(
kmers_int
,
reduce_func
=
reduce_func
,
metric
=
"hamming"
,
n_jobs
=
cpus
,
working_memory
=
working_memory
)
N
=
next
(
gen
)
start
=
time
.
time
()
report_after_every
=
n
/
10
count
=
0
while
N
is
not
None
:
M
=
vstack
([
M
,
N
])
count
+=
N
.
shape
[
0
]
try
:
N
=
next
(
gen
)
except
StopIteration
:
break
if
count
>
report_after_every
:
print
(
"---------------------------------------------"
)
print
(
"Percent done: "
,
M
.
shape
[
0
]
/
M
.
shape
[
1
]
*
100
,
" "
,
M
.
shape
[
0
],
" of "
,
M
.
shape
[
1
],
"Rows"
)
print
(
"Time spend: "
,
time
.
time
()
-
start
,
"s"
)
count
=
0
############## OUTPUT ###################################
end
=
time
.
time
()
print
(
"-----------------RESULT----------------------"
)
print
(
"Time spend: "
,
time
.
time
()
-
start
,
"s"
)
print
(
"Matrix shape:"
,
M
.
shape
)
print
(
"Density of sparse matrix: "
,
M
.
getnnz
()
/
np
.
prod
(
M
.
shape
))
M
=
M
.
tocoo
()
# eg 'V_kmer_distances.npz'
save_npz
(
snakemake
.
output
[
'V_kmers_distances'
],
M
)
# https://github.com/rogersce/cnpy
# https://stackoverflow.com/questions/36433030/load-scipy-sparse-csr-matrix-in-c
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment