# Runs ../scripts/estimateKmerCoverage.py on the two quality-controlled read
# files of a sample. Declared outputs: a k-mer histogram that is included in
# the report, a raw histogram, a k-mer-count-based coverage estimate and a
# k-mer error file.
rule estimateKmerCoverage:
    input:
        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
    output:
        histogram = report('data/output/{dataset}/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
        histogramRaw = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.raw.png',
        mean = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
        kmererror = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
    params:
        # The k-mer length is taken from the {kmer} wildcard of the output path
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1', #TODO: This could in theory be sped up significantly using a shared cache and multithreading
        gpus = '0',
        mem = '64G',
        walltime = '00:45:00'
    log:
        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
    benchmark:
        'benchmarks/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateKmerCoverage.py'

# Disabled rule: it is wrapped in a bare triple-quoted string, so it is never
# registered as a rule. Kept for reference.
'''
rule estimateKmerCoverageFiltered:
    input:
        reads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
    output:
        histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
    params:
        k = lambda wildcards: wildcards.kmer
    #TODO: Threads = 2 ?
    log:
        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
    conda:
        '../envs/main.yaml'
    script:
        '../scripts/estimateKmerCoverageFiltered.py'
'''

# Runs ../scripts/estimateKmerCoverage_alignment.py on the alignment-based
# coverage, read length and base error estimates of a sample and writes an
# alignment-based k-mer coverage estimate for the requested k.
rule estimateKmerCoverageAlignment:
    input:
        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt',
        readLengthEstimate = 'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt',
        baseErrorEstimate = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
    output:
        kmerCoverage = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '16G',
        walltime = '00:30:30'
    log:
        # Own log file: this rule previously shared its log path with
        # estimateKmerCoverage, so each rule overwrote the other's log.
        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageAlignment.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateKmerCoverage_alignment.py'

# Writes the mean per-position depth reported by `samtools depth` for the
# reference named "maskref" to the coverage estimate file.
rule estimateCoverageAlignment:
    input:
        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam'
    output:
        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt'
    params:
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '16G',
        walltime = '00:30:30'
    log:
        'logs/{dataset}/{id}/estimateKmerCoverage_alignment.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    shell:
        # The mean is taken over the "maskref" rows only (counter n), not over
        # every row (NR), so positions of other references cannot dilute it.
        # When there is no "maskref" row at all, 0 is reported instead of
        # dividing by zero. grep strips the "Average = " label from the output.
        'samtools depth {input.filteredAlignment} '
        '| awk \' $1 == "maskref" {{sum+=$3; n++}} END {{ if (n > 0) print "Average = ",sum/n; else print "Average = ",0 }}\' '
        '| grep -Eo \'[+-]?[0-9]+([.][0-9]+)?\' '
        '> {output.coverageEstimate}'

# Runs ../scripts/calcPriorProbabilities.py on the coverage-based likelihoods
# and writes the prior file. `dps` is read from the workflow config.
rule calcPriorProbabilitiesCoverage:
    input:
        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json'
    output:
        priorFilePath = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '2G',
        walltime = '00:05:30'
    log:
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calcPriorProbabilities.py'

# Runs ../scripts/calcSpaTypeProbabilities.py on the coverage-based
# likelihoods and prior and writes the probabilistic score table.
rule calcProbabilitiesCoverage:
    input:
        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
        prior = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
    output:
        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
    params:
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '8G',
        walltime = '00:10:30'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
    script:
        '../scripts/calcSpaTypeProbabilities.py'

# Runs ../scripts/createFitnessPlots.py on the alignment counts, the
# coverage-based probabilities and the spa sequence ratios; the resulting SVG
# is included in the report.
rule createFitnessPlots:
    input:
        counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
    output:
        report('data/output/{dataset}/{iterset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
    params:
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '1G',
        walltime = '00:20:30'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/createFitnessPlots.py'

# Runs ../scripts/calcExpectedCounts.py on a k-mer coverage estimate and the
# spa sequence k-mer counts. Which coverage estimate file is used is decided
# by determineKmerCoverageEstimateFile(), which is defined outside this section.
rule calcExpectedCounts:
    input:
        kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
    output:
        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json'
    params:
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calcExpectedCounts.py'