Commit 41c8840a authored by Jan Hoeckesfeld's avatar Jan Hoeckesfeld
Browse files

added possibility to compute distance matrix over the V kmers

parent 120b51b5
......@@ -13,6 +13,8 @@ dependencies:
- samtools = 1.11
- seaborn = 0.11.0
- scipy = 1.5.2
- scikit-learn = 0.23.2
- numpy = 1.18.5
- pysam = 0.16.0.1
- mpmath = 1.1.0
- matplotlib-venn=0.11.5
- matplotlib-venn = 0.11.5
......@@ -452,3 +452,21 @@ rule createRatios:
script:
'../scripts/calculateKmerRatios.py'
rule createDistanceMatrixOverKmersOfV:
input:
kmers = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
output:
V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz'
params:
k = lambda wildcards: wildcards.kmer,
hamming_distance_cutoff = 5,
#cluster execution
cpus = '2',
mem = '4G',
gpus = '0',
walltime = '00:15:00'
conda:
'../envs/biopythonworkbench.yaml'
script:
'../scripts/calcDistanceMatrixVkmers.py'
import json
import numpy as np
import time
from scipy.sparse import coo_matrix, vstack, save_npz
from sklearn.metrics import pairwise_distances_chunked, pairwise_distances
##############INPUT######################################
# eg "spaSequences.counts.json"
f = open(snakemake.input['kmers'], "r")
expected_json = f.read()
person_dict = json.loads(expected_json)
kmers = {k for kmers in person_dict.values() for k in kmers.keys()}
kmers = sorted(kmers)
n = len(kmers)
m = len(kmers[0])
#########################################################
# eg 5
ALLOWED_DISTANCE = int(snakemake.params['hamming_distance_cutoff'])/m
def reduce_func(D_chunk, start):
neigh = np.where(D_chunk < ALLOWED_DISTANCE, D_chunk, 0)
return (neigh * m).astype('u1')
base_dict = {"A": 0, "C": 1, "G": 2, "T": 3}
def base_to_int(c):
return base_dict[c]
def kmer_to_int(v):
return [base_to_int(c) for c in v]
def kmers_to_int(kmers):
return [kmer_to_int(k) for k in kmers]
# sklearn only computes integer hamming distances
kmers_int = kmers_to_int(kmers)
M = coo_matrix((0, n), dtype='u1')
print(M.shape)
# To big without cutoff. If dtype 'u1' is used, still about 10 GB:
# print(pairwise_distances(kmers_int, metric="hamming"))
cpus = -1
# working_memory = 1024
# eg snakemake.params['mem'] = 1G
working_memory = int(snakemake.params['mem'][:-1]) * 1000 - 500
gen = pairwise_distances_chunked(kmers_int, reduce_func=reduce_func, metric="hamming", n_jobs=cpus,
working_memory=working_memory)
N = next(gen)
start = time.time()
report_after_every = n / 10
count = 0
while N is not None:
M = vstack([M, N])
count += N.shape[0]
try:
N = next(gen)
except StopIteration:
break
if count > report_after_every:
print("---------------------------------------------")
print("Percent done: ", M.shape[0] / M.shape[1] * 100, " ", M.shape[0], " of ", M.shape[1], "Rows")
print("Time spend: ", time.time() - start, "s")
count = 0
############## OUTPUT ###################################
end = time.time()
print("-----------------RESULT----------------------")
print("Time spend: ", time.time() - start, "s")
print("Matrix shape:", M.shape)
print("Density of sparse matrix: ", M.getnnz() / np.prod(M.shape))
M = M.tocoo()
# eg 'V_kmer_distances.npz'
save_npz(snakemake.output['V_kmers_distances'], M)
# https://github.com/rogersce/cnpy
# https://stackoverflow.com/questions/36433030/load-scipy-sparse-csr-matrix-in-c
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment