Commit 43c43755 authored by Jan Hoeckesfeld's avatar Jan Hoeckesfeld
Browse files

added venndiagrams

parent 7b196446
......@@ -39,6 +39,7 @@ def get_input():
input_list.append(expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths))
if config['kmer_stats_analysis']:
input_list.append(expand('data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs))
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs))
return input_list
......
......@@ -9,9 +9,10 @@ dependencies:
- biopython=1.71
- python=3.7
- regex
- openssl = 1.0
- samtools = 1.9
- seaborn = 0.9.0
- scipy = 1.2.1
- pysam = 0.15.2
- openssl = 1.1.1h
- samtools = 1.11
- seaborn = 0.11.0
- scipy = 1.5.2
- pysam = 0.16.0.1
- mpmath = 1.1.0
- matplotlib-venn=0.11.5
......@@ -340,8 +340,25 @@ rule calcAverageReadLength:
'../envs/biopythonworkbench.yaml'
shell:
'awk \' {{ if(NR%4==2) {{count++; bases += length}} }} END{{print bases/count}} \' {input.read1} > {output}' #todo: use both read files?
rule createSpaTypeVennDiagram:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
scores = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
output:
venngtt = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
venntopsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
vennrandomsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
params:
gtt = lambda wildcards : getGroundTruthType(wildcards.id)
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
'../scripts/spaTypeVennDia.py'
rule createValidKmerHistogram:
......
......@@ -64,7 +64,7 @@ rule calcLikelihoods:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
iterationset = determineIterationset({id}}),
#iterationset = determineIterationset({id}}),
kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt',
kmerCoverageEstimate = determineKmerCoverageEstimateFile()
output:
......
import json
import logging
import random
from textwrap import wrap
import regex as re
from matplotlib import pyplot as plt
from matplotlib_venn import venn3
logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")
# Read counts
expected_counts = json.load(open(snakemake.input['expected'], 'r'))
actual_counts = json.load(open(snakemake.input['observed'], 'r'))
# without assessing the actual counts! just if it exists!
V = set(k for spaTypeID in expected_counts for k in expected_counts[spaTypeID])
O = set(k for k in actual_counts)
def generate_six_venndiagrams(type_counts, type_names, filepath):
figure, axes = plt.subplots(2, 3, figsize=(11.69, 8.27))
x = [0, 0, 0, 1, 1, 1]
y = [0, 1, 2, 0, 1, 2]
for i in range(6):
v = venn3([O, type_counts[i], V], set_labels=('O', type_names[i], 'V'),
set_colors=('darkorange', 'blue', 'green'), alpha=0.8,
ax=axes[x[i]][y[i]])
for idx, subset in enumerate(v.subset_labels):
if subset is not None:
v.subset_labels[idx].set_visible(True)
axes[1, 2].axis('off')
figure.savefig(filepath)
##############################################################
# Si = groundtruth spatype!
Si = set(k for k in expected_counts[snakemake.params['gtt']])
fig = plt.figure()
v = venn3([O, Si, V], set_labels=('O', 'Si', 'V'), set_colors=('darkorange', 'blue', 'green'), alpha=0.8)
fig.suptitle(
"\n".join(wrap("Distribution of kmers in spa-type " + snakemake.params['gtt'] + " (expected spa-type)", 60)))
fig.savefig(snakemake.output['venngtt'])
################################################################
# take the 6 most feasible spatypes and plot their venn diagrams!
f = open(snakemake.input['scores'], 'r')
t = []
type_names = []
type_kmer_counts = []
for i, l in enumerate(f.readlines()):
if i > 5:
break
t.append(re.split(r'\t+', l.rstrip('\t\n')))
type_names.append(t[i][0])
type_kmer_counts.append(set(k for k in expected_counts[type_names[i]]))
generate_six_venndiagrams(type_kmer_counts, type_names, snakemake.output['venntopsix'])
##################################################################
# random six types
random_spatypes = random.sample(list(expected_counts), 6)
type_kmer_counts = [set(k for k in expected_counts[s]) for s in random_spatypes]
generate_six_venndiagrams(type_kmer_counts, random_spatypes, snakemake.output['vennrandomsix'])
##################################################################
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment