Commit c7246bda authored by Philipp Spohr's avatar Philipp Spohr
Browse files

Added software

parent 7ffe33b6
# Score a sample against all spa sequences by comparing its observed k-mer
# profile to the precomputed spa-sequence k-mer profiles (euclidean distance,
# per the output filename; computed by scripts/kmerdiff.py).
rule distance:
    input:
        readProfile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
        spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
    output:
        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.euclidean.tsv'
    conda:
        '../envs/biopythonworkbench.yaml'
    params:
        # cluster execution
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '00:10:00'
    script:
        '../scripts/kmerdiff.py'
# Search the configured spa repeats file for duplicate entries and write the
# findings to data/output/duplicate_repeats.
# NOTE(review): conda path here is 'envs/...' while most rules in this file use
# '../envs/...' — confirm which is correct relative to this file's location.
rule detectDuplicates_repeats:
    input:
        repeats = 'data/input/'+config['spa_repeats_file']
    output:
        out = 'data/output/duplicate_repeats'
    conda:
        'envs/biopythonworkbench.yaml'
    script:
        'scripts/searchDuplicates.py'
# Search the extracted spa sequences for duplicates (same script as the
# repeats variant, applied to spaSequences.fa).
# NOTE(review): this rule uses 'envs/main.yaml' while its siblings use
# 'envs/biopythonworkbench.yaml' — verify the difference is intentional.
rule detectDuplicates_sequences:
    input:
        repeats = 'data/auxiliary/spaSequences.fa'
    output:
        out = 'data/output/duplicate_sequences'
    conda:
        'envs/main.yaml'
    script:
        'scripts/searchDuplicates.py'
# Search the spa type definition table for duplicate definitions.
rule detectDuplicates_definitions:
    input:
        types = 'data/input/'+config['spa_types']
    output:
        out = 'data/output/duplicate_definitions'
    conda:
        'envs/biopythonworkbench.yaml'
    script:
        'scripts/searchDuplicateDefinitions.py'
# Returns the correct filename in which the required information is stored depending on the configuration setting
def determineKmerCoverageEstimateFile(cfg=None):
    """Return the path template of the k-mer coverage estimate file.

    The template keeps the ``{kmer}`` and ``{id}`` wildcards for Snakemake
    to expand.

    Args:
        cfg: optional mapping providing 'kmerCoverageEstimationMethod' and
            'input_folder'; defaults to the global Snakemake ``config``
            (backward compatible with the original zero-argument call).

    Returns:
        The path template matching the configured estimation method.

    Raises:
        ValueError: if the configured method is unknown. (The original
            implementation silently fell through and returned None here,
            which surfaced only as a confusing downstream error.)
    """
    if cfg is None:
        cfg = config
    base = 'data/auxiliary/' + cfg['input_folder'] + '/kmers/{kmer}/{id}/'
    method = cfg['kmerCoverageEstimationMethod']
    if method == 'alignment':
        return base + 'coverage_estimate_alignmentbased.txt'
    # Both count-based methods read the same estimate file.
    if method in ('countMean', 'countPoisson'):
        return base + 'coverage_estimate_kmercountbased.txt'
    raise ValueError('Unknown kmerCoverageEstimationMethod: {}'.format(method))
def getGroundTruthType(fid):
    """Return the ground-truth spa type recorded for sample id *fid*.

    Looks *fid* up in the tab-separated file data/input/groundTruth.tsv
    (column 0 = sample id, column 1 = spa type).

    Raises:
        WorkflowError: when no row for *fid* exists in the file.
    """
    with open('data/input/groundTruth.tsv', 'r') as gtfile:
        for record in gtfile.read().splitlines():
            fields = record.split('\t')
            if fields[0] == fid:
                return fields[1]
    raise WorkflowError("Can't find {} in ground truth file ... check that an entry exists!".format(fid))
### Mapping ###
# Mask the protein A region (config['protein_a_identifier']) in the reference
# genome using its protein table, producing maskedRef.fa (scripts/maskProteinA.py).
rule extractMaskedReferenceGenome:
    input:
        refg = 'data/input/'+config['reference_genome']+'/'+config['genome_file_identifier'],
        pt = 'data/input/'+config['reference_genome']+'/'+config['protein_table_identifier']
    output:
        main = 'data/auxiliary/maskedRef.fa'
    params:
        pid = config['protein_a_identifier'],
        #cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/maskProteinA.py'
# Build synthetic protein A sequences by combining the protein A frame with
# each spa sequence; also emits a metadata file describing the constructs.
rule createSyntheticProteinAs:
    input:
        proteinAFrame = 'data/input/proteinAFrame.ref',
        spaSequences = 'data/auxiliary/spaSequences.fa'
    output:
        seqs = 'data/auxiliary/syntheticProteinAs.fa',
        metaInf = 'data/auxiliary/syntheticProteinAs.meta'
    params:
        #cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/createSyntheticProteinAs.py'
# Concatenate the masked reference and the synthetic protein A sequences into
# a single FASTA ("match board") used as the mapping target.
rule concatRelevantSequences:
    input:
        synth = 'data/auxiliary/syntheticProteinAs.fa',
        masked = 'data/auxiliary/maskedRef.fa'
    output:
        'data/auxiliary/matchBoard.fa'
    params:
        #cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:02:00'
    shell:
        'cat {input.masked} {input.synth} > {output}'
# Map the quality-controlled paired-end reads against the match board with
# bwa mem. The .bwt input is a ghost dependency ensuring the index exists.
# NOTE(review): bwa mem emits SAM; the output is named .bam — confirm
# downstream tools accept this (sorting happens in a later rule).
rule bwa:
    input:
        bwi = 'data/auxiliary/matchBoard.fa.bwt',
        mb = 'data/auxiliary/matchBoard.fa',
        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
    output:
        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.bam'
    params:
        #cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    singularity:
        'docker://biocontainers/bwa:v0.7.17-3-deb_cv1'
    shell:
        'bwa mem {input.mb} {input.read1} {input.read2} -o {output}'
        #'bwa mem {input.mb} {input.read1} -o {output}'
# Compute strand bias statistics from the sorted alignment and publish them
# as a report item (category 'Strand Bias').
rule determineStrandBias:
    input:
        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam.bai'
    output:
        strandbias = report('data/output/'+config['input_folder']+'/{id}/strandbias.txt',category='Strand Bias')
    params:
        #cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/determineStrandBias.py'
# Keep only mapped reads (-F 4) falling in the synthetic protein A regions
# (whitelist via samtools -L), writing a filtered BAM.
rule filter_primary_matches:
    input:
        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
        whitelist = 'data/auxiliary/syntheticProteinAs.meta'
    output:
        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
    params:
        #cluster execution
        cpus = '1',
        mem = '16G',
        gpus = '0',
        walltime = '00:10:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    shell:
        'samtools view -F 4 {input.alignment} -L {input.whitelist} -b > {output}'
# Same strand bias computation as determineStrandBias, but on the filtered
# alignment; result goes to data/auxiliary instead of the report.
rule determineFilteredStrandBias:
    input:
        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai'
    output:
        strandbias = 'data/auxiliary/'+config['input_folder']+'/{id}/strandbias.filtered.txt'
    params:
        #cluster execution
        cpus = '1',
        mem = '16G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/determineStrandBias.py'
# Convert the filtered BAM back to FASTQ for downstream re-mapping/analysis.
rule extractFilteredReadsAsFastQ:
    input:
        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
    output:
        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
    params:
        #cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:10:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    shell:
        'samtools fastq {input.filteredAlignment} > {output.filteredReads}'
# Plot k-mer error distributions and count deviations for a sample, using its
# ground-truth spa type (looked up lazily per wildcard) as reference.
# NOTE(review): the log path reuses the .svg output name — confirm this is
# intentional rather than a copy-paste slip.
rule createKmerDistributionGroundTruth_COVERAGE_BASED:
    input:
        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
    output:
        errors = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
        deviations = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/countDeviations.svg'
    params:
        gt = lambda wildcards : getGroundTruthType(wildcards.id),
        #cluster execution
        cpus = '1',
        mem = '64G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
    script:
        '../scripts/createKmerErrorDistributionPlots.py'
# Break down the likelihood computation for a sample against its ground-truth
# spa type (scripts/likelihoodBreakdown.py).
rule likelihoodAnalysis_COVERAGE_BASED:
    input:
        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
    output:
        likelihoodAnalysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
    params:
        gt = lambda wildcards : getGroundTruthType(wildcards.id)
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
    script:
        '../scripts/likelihoodBreakdown.py'
# Re-map the filtered reads against the sample's known ground-truth spa type
# reference (method-analysis aid). Index input is a ghost dependency.
rule mapAgainstGroundTruth:
    input:
        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq',
        groundTruthSequence = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa',
        groundTruthIndex = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa.bwt'
    output:
        'data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
    params:
        # cluster execution
        cpus = '1',
        mem = '64G',
        gpus = '0',
        walltime = '00:30:00'
    singularity:
        'docker://biocontainers/bwa:v0.7.17-3-deb_cv1'
    shell:
        'bwa mem {input.groundTruthSequence} {input.filteredReads} -o {output}'
# Check that spa-sequence k-mers do not collide with the masked reference;
# publishes a uniqueness report per k.
rule verifyUniqueness:
    input:
        kmerCounts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
        maskedReference = 'data/auxiliary/maskedRef.fa'
    output:
        report('data/output/kmers/{kmer}/uniquenessTest.tsv',category='kmerUniqueness')
    conda:
        '../envs/biopythonworkbench.yaml'
    params:
        k = lambda wildcards : wildcards.kmer,
        # cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:30:00'
    script:
        '../scripts/verifyUniqueness.py'
# Compare the filtered alignment against per-type synthetic reference metadata
# for the sample's ground-truth type; emits the correctly mapped reads and a
# comparison summary (scripts/analyzeMapping.py).
rule analyzeMapping:
    input:
        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending'],
        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai',
        metaInf = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
    output:
        correctAlignments = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
        analysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison'
    conda:
        '../envs/biopythonworkbench.yaml'
    params:
        # cluster execution
        cpus = '1',
        mem = '1G',
        gpus = '0',
        walltime = '00:10:00'
    script:
        '../scripts/analyzeMapping.py'
# Build k-mer counts and per-kmer origin records from the reads known to map
# correctly (method-analysis reference profile).
rule makeKmerProfilesFromTrueReads:
    input:
        filteredReads = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
        regionXMetaData = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
    output:
        counts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
        origins = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
    params:
        k = lambda wildcards: wildcards.kmer
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/makeKmerProfilesFromTrueReads.py'
# Diff the observed k-mer counts against the "true" counts derived from
# correctly mapped reads.
rule compareObservedKmerProfileToTrueProfile:
    input:
        trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
    output:
        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed.txt'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts.py'
# Diff the observed k-mer counts against the model's expected counts, using
# the ground-truth table for reference (compareKmerCounts_expected.py).
rule compareExpectedKmerProfileToObserved:
    input:
        trueCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        groundTruthFile = 'data/input/' + config['ground_truth']
    output:
        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
    params:
        inputFileID = lambda wildcards: wildcards.id
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts_expected.py'
# Precompute per-k k-mer profiles and raw counts for every spa sequence.
rule makeSequenceProfiles:
    input:
        sequences = 'data/auxiliary/spaSequences.fa'
    output:
        profiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json',
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
    params:
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/makeKmerProfiles.py'
# Estimate the mean read length from read 1 only: awk sums the length of every
# FASTQ sequence line (NR%4==2) and divides by the record count.
rule calcAverageReadLength:
    input:
        read1 = 'data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending']
    output:
        'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt'
    conda:
        '../envs/biopythonworkbench.yaml'
    shell:
        'awk \' {{ if(NR%4==2) {{count++; bases += length}} }} END{{print bases/count}} \' {input.read1} > {output}' #todo: use both read files?
# Render a histogram of valid k-mers (expected vs observed) for a sample,
# annotated with its ground-truth type.
rule createValidKmerHistogram:
    input:
        expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
    output:
        histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/validKmersHisto.svg'
    params:
        gtt = lambda wildcards : getGroundTruthType(wildcards.id)
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/validKmersHisto.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/createValidKmerHistogram.py'
# Derive a per-k k-mer error rate from the sample's base error estimate.
rule calcKmerErrorRates:
    input:
        baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
    output:
        error = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/error_estimate.txt'
    params:
        k = lambda wildcards: wildcards.kmer
    log:
        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/calcKmerErrorRates.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calcKmerErrorRate.py'
# Two mutually exclusive definitions of makeReadProfiles, selected at parse
# time: when mapping is skipped, profiles are built straight from the QC'd
# FASTQ reads; otherwise they are extracted from the filtered alignment.
if config['skipMapping']:
    rule makeReadProfiles:
        input:
            expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
            read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
            read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
        output:
            profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
            counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
        params:
            k = lambda wildcards: wildcards.kmer,
            #cluster execution
            cpus = '1',
            mem = '32G',
            gpus = '0',
            walltime = '00:30:00'
        log:
            'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
        conda:
            '../envs/biopythonworkbench.yaml'
        script:
            '../scripts/makeKmerProfilesFromFastq.py'
else:
    rule makeReadProfiles:
        input:
            alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
            read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
            read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending'],
            index = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai', #ghost input
            regions = 'data/auxiliary/syntheticProteinAs.meta'
        output:
            profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
            counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
            debug = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignmentExtraction.txt'
            #local_coverage_estimate = 'data/auxiliary/kmers/{kmer}/{id}/local_coverage_estimate.txt'
        params:
            k = lambda wildcards: wildcards.kmer,
            #cluster execution
            cpus = '1',
            mem = '32G',
            gpus = '0',
            walltime = '00:30:00'
        log:
            'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
        conda:
            '../envs/biopythonworkbench.yaml'
        script:
            '../scripts/makeKmerProfileFromSam.py'
# Compute k-mer ratios from the spa-sequence counts (calculateKmerRatios.py).
rule createRatios:
    input:
        'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
    output:
        'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
    params:
        k = lambda wildcards: wildcards.kmer,
        #cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:15:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calculateKmerRatios.py'
# Diff the expected k-mer counts against the "true" counts from correctly
# mapped reads (same script as compareExpectedKmerProfileToObserved).
rule compareExpectedKmerProfileToTrueProfile:
    input:
        trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        groundTruthFile = 'data/input/' + config['ground_truth']
    output:
        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_expected.txt'
    params:
        inputFileID = lambda wildcards: wildcards.id
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts_expected.py'
# Compute prior probabilities from the per-type likelihoods; dps comes from
# config (decimal precision, presumably — confirm in calcPriorProbabilities.py).
rule calcPriorProbabilities:
    input:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
    output:
        priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        mem = '1G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
    script:
        '../scripts/calcPriorProbabilities.py'
# Combine likelihoods with the prior into spa-type probabilities
# (generative-model scores, scores.probabilistic_gen.tsv).
rule calcProbabilities:
    input:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json',
        prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
    output:
        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
    params:
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities.log'
    script:
        '../scripts/calcSpaTypeProbabilities.py'
def extractTsvValue(filePath, line, nolabels=False):
    """Return one value from row *line* of the TSV file at *filePath*.

    With nolabels=True the row's first column is returned (label-less file);
    otherwise the second column (value next to its label).
    """
    with open(filePath, 'r') as handle:
        rows = handle.read().splitlines()
    fields = rows[line].split('\t')
    return fields[0] if nolabels else fields[1]
# Run the c_kmertools container (coverage-based model, --m 0) to compute
# per-type likelihoods from expected/observed counts, k-mer error rate and
# the configured coverage estimate.
# NOTE(review): extractCoverageEstimateFile (used in deviationCutoff) is not
# defined in this file — confirm it is provided by an included module.
rule calcLikelihoods:
    input:
        expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt',
        kmerCoverageEstimate = determineKmerCoverageEstimateFile()
    output:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
    benchmark:
        'benchmarks/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
    params:
        e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
        deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
        #cluster exectuion
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '01:30:00'
    singularity:
        'docker://phspo/ckmertools:latest'
    shell:
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log}'
# Run c_kmertools in generative mode (--m 1) using the spa-sequence counts
# and the sample's base error estimate.
rule calcLikelihoods_Generative:
    input:
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
        observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
    output:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
    params:
        # cluster execution
        cpus = '1',
        mem = '3G',
        gpus = '0',
        walltime = '24:00:00',
        k = lambda wildcards: wildcards.kmer,
        # base error is stored label-less, hence nolabels=True
        e = lambda wildcards,input : extractTsvValue(input.baseError,0,True)
    singularity:
        'docker://phspo/ckmertools:latest'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
    benchmark:
        'benchmarks/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
    shell:
        'c_kmertools --p {input.counts} --m 1 --c {params.cpus} --o {input.observed} --baseerror {params.e} --k {params.k} --target {output.likelihoods} --log {log}'
# Estimate the per-base sequencing error rate from both QC'd read files.
rule estimateErrorRates:
    input:
        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
    output:
        baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
    log:
        'logs/'+config['input_folder']+'/kmers/{id}/estimateErrorRates.log'
    params:
        # cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateErrorRates.py'
# NOTE(review): the rule below is deliberately disabled by wrapping it in a
# module-level string literal; kept verbatim for reference. Delete or
# re-enable once a decision is made.
'''
rule calcAverageCoverage:
input:
alignment = 'data/auxiliary/{id}/alignment.sorted.bam'
output:
'data/auxiliary/{id}/averageCoverage.depth'
conda:
'../envs/main.yaml'
shell:
'samtools mpileup -B -d 10000 -q0 -Q0 -r maskref {input.alignment} > {output}'
'''
####DEBUG RULES#####
rule calcKmerStats:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
error = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/error_estimate.txt',
coverage_estimate = determineKmerCoverageEstimateFile()
output:
stats = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv'
params:
id = lambda wildcards: wildcards.id,