Commit cc4d7666 authored by Jan Hoeckesfeld's avatar Jan Hoeckesfeld
Browse files

added wildcard datasets

parent 0a533407
......@@ -9,29 +9,35 @@ validate(config, "schemas/config.schema.yaml")
#Generate Input/Output Files from specified folder
inputIDs, = glob_wildcards('data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'])
kmer_lengths = config['kmers']
kmer_lengths = config['kmers']
dataset_inputIDs = {}
#kmer_lengths = [24]
possible_params = {'generative_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
'probabilistic_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
# if above:
'plot_top3_fit': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
'distance_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
'assembly_model': expand('data/output/'+config['input_folder']+'/{id}/exactMatches.tsv',id=inputIDs),
'calc_strand_bias': expand('data/output/'+config['input_folder']+'/{id}/strandbias.txt',id=inputIDs),
'mapping_diff_analysis': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
'map_filtered_reads': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
'kmer_stats_analysis': expand('data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
}
def get_inputs(dataset):
run_dir = dataset
inputIDs, = glob_wildcards('data/input/'+dataset+'/{id}'+config["datasets"][dataset]['input_read_1_ending'])
dataset_inputIDs[dataset] = inputIDs
possible_params = {'generative_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
'probabilistic_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
# if above:
'plot_top3_fit': expand('data/output/'+run_dir+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
'distance_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
expand('data/output/'+run_dir+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
}
return [possible_params[k] for k in possible_params.keys() if config[k]]
rule all:
input:
[possible_params[k] for k in possible_params.keys() if config[k]]
[get_inputs(dataset) for dataset in config["datasets"].keys()]
##### load rules #####
......
......@@ -43,9 +43,13 @@ protein_a_identifier : protein A #NCTC8325
l_lactate_permease_identifier : L-lactate permease #NCTC8325
arlp_identifier : accessory regulator-like protein #NCTC8325
input_folder : test
input_read_1_ending : _1.fq
input_read_2_ending : _2.fq
datasets:
test:
input_read_1_ending : _1.fq
input_read_2_ending : _2.fq
test2:
input_read_1_ending : _R1.fastq
input_read_2_ending : _R2.fastq
### Method Analysis
......
rule spades:
input:
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
output:
directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
directory('data/auxiliary/{dataset}/{id}/spades')
singularity:
'docker://pegi3s/spades:latest'
shell:
......@@ -13,12 +13,12 @@ rule spades:
rule exactMatch:
input:
infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
infolder = 'data/auxiliary/{dataset}/{id}/spades'
output:
'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
'data/output/{dataset}/{id}/exactMatches.tsv'
params:
spaSeqs = 'data/auxiliary/spaSequences.fa',
scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
scaffolds = 'data/auxiliary/{dataset}/{id}/spades/scaffolds.fasta'
conda:
'../envs/biopythonworkbench.yaml'
script:
......
rule estimateKmerCoverage:
input:
read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
output:
histogram = report('data/output/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
histogramRaw = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.raw.png',
mean = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
kmererror = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmer_error.txt'
histogram = report('data/output/{dataset}/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
histogramRaw = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.raw.png',
mean = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
kmererror = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
params:
k = lambda wildcards: wildcards.kmer,
#cluster execution
......@@ -15,9 +15,9 @@ rule estimateKmerCoverage:
mem = '64G',
walltime = '00:45:00'
log:
'logs/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
benchmark:
'benchmarks/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'benchmarks/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -27,14 +27,14 @@ rule estimateKmerCoverage:
'''
rule estimateKmerCoverageFiltered:
input:
reads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
reads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
output:
histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
params:
k = lambda wildcards: wildcards.kmer
#TODO: Threads = 2 ?
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
conda:
'../envs/main.yaml'
script:
......@@ -44,11 +44,11 @@ rule estimateKmerCoverageFiltered:
rule estimateKmerCoverageAlignment:
input:
coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt',
readLengthEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt',
baseErrorEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt',
readLengthEstimate = 'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt',
baseErrorEstimate = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
output:
kmerCoverage = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
kmerCoverage = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
params:
k = lambda wildcards: wildcards.kmer,
# cluster execution
......@@ -57,7 +57,7 @@ rule estimateKmerCoverageAlignment:
mem = '16G',
walltime = '00:30:30'
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -65,9 +65,9 @@ rule estimateKmerCoverageAlignment:
rule estimateCoverageAlignment:
input:
filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam'
filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam'
output:
coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt'
coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt'
params:
# cluster execution
cpus = '1',
......@@ -75,7 +75,7 @@ rule estimateCoverageAlignment:
mem = '16G',
walltime = '00:30:30'
log:
'logs/'+config['input_folder']+'/{id}/estimateKmerCoverage_alignment.log'
'logs/{dataset}/{id}/estimateKmerCoverage_alignment.log'
conda:
'../envs/biopythonworkbench.yaml'
shell:
......@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
rule calcPriorProbabilitiesCoverage:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json'
output:
priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
params:
k = lambda wildcards: wildcards.kmer,
dps = config['dps'],
......@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
mem = '2G',
walltime = '00:05:30'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
rule calcProbabilitiesCoverage:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
output:
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
params:
dps = config['dps'],
# cluster execution
......@@ -117,18 +117,18 @@ rule calcProbabilitiesCoverage:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
script:
'../scripts/calcSpaTypeProbabilities.py'
rule createFitnessPlots:
input:
counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
output:
report('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
report('data/output/{dataset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
params:
dps = config['dps'],
# cluster execution
......@@ -147,7 +147,7 @@ rule calcExpectedCounts:
kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
output:
'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json'
'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json'
params:
k = lambda wildcards: wildcards.kmer,
# cluster execution
......
rule distance:
input:
readProfile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
output:
'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.euclidean.tsv'
'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
conda:
'../envs/biopythonworkbench.yaml'
params:
......
# Returns the correct filename in which the required information is stored depending on the configuration setting
def determineKmerCoverageEstimateFile():
if config['kmerCoverageEstimationMethod'] == 'alignment':
return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
elif config['kmerCoverageEstimationMethod'] == 'countMean':
return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
elif config['kmerCoverageEstimationMethod'] == 'countPoisson':
return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
# Returns the corresponding ground truth spa-type for a given file (sample) id
def getGroundTruthType(fid):
......@@ -75,10 +75,10 @@ rule bwa:
input:
bwi = 'data/auxiliary/matchBoard.fa.bwt',
mb = 'data/auxiliary/matchBoard.fa',
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
output:
'data/auxiliary/'+config['input_folder']+'/{id}/alignment.bam'
'data/auxiliary/{dataset}/{id}/alignment.bam'
params:
#cluster execution
cpus = '1',
......@@ -95,10 +95,10 @@ rule bwa:
rule determineStrandBias:
input:
alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam.bai'
alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
idx = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam.bai'
output:
strandbias = report('data/output/'+config['input_folder']+'/{id}/strandbias.txt',category='Strand Bias')
strandbias = report('data/output/{dataset}/{id}/strandbias.txt',category='Strand Bias')
params:
#cluster execution
cpus = '1',
......@@ -114,10 +114,10 @@ rule determineStrandBias:
rule filter_primary_matches:
input:
alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
whitelist = 'data/auxiliary/syntheticProteinAs.meta'
output:
'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
params:
#cluster execution
cpus = '1',
......@@ -131,10 +131,10 @@ rule filter_primary_matches:
rule determineFilteredStrandBias:
input:
alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai'
alignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai'
output:
strandbias = 'data/auxiliary/'+config['input_folder']+'/{id}/strandbias.filtered.txt'
strandbias = 'data/auxiliary/{dataset}/{id}/strandbias.filtered.txt'
params:
#cluster execution
cpus = '1',
......@@ -148,9 +148,9 @@ rule determineFilteredStrandBias:
rule extractFilteredReadsAsFastQ:
input:
filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
output:
filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
params:
#cluster execution
cpus = '1',
......@@ -167,13 +167,13 @@ rule extractFilteredReadsAsFastQ:
rule createKmerDistributionGroundTruth_COVERAGE_BASED:
input:
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
output:
errors = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
deviations = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/countDeviations.svg'
errors = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
deviations = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
params:
gt = lambda wildcards : getGroundTruthType(wildcards.id),
#cluster execution
......@@ -184,35 +184,35 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
'logs/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
script:
'../scripts/createKmerErrorDistributionPlots.py'
rule likelihoodAnalysis_COVERAGE_BASED:
input:
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
output:
likelihoodAnalysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
likelihoodAnalysis = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
params:
gt = lambda wildcards : getGroundTruthType(wildcards.id)
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
'logs/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
script:
'../scripts/likelihoodBreakdown.py'
rule mapAgainstGroundTruth:
input:
filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq',
filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq',
groundTruthSequence = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa',
groundTruthIndex = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa.bwt'
output:
'data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
'data/output/{dataset}/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
params:
# cluster execution
cpus = '1',
......@@ -244,14 +244,14 @@ rule verifyUniqueness:
rule analyzeMapping:
input:
read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending'],
filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai',
metaInf = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq',
filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai',
metaInf = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
output:
correctAlignments = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
analysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison'
correctAlignments = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
analysis = 'data/output/{dataset}/methodAnalysis/{id}/mapping.comparison'
conda:
'../envs/biopythonworkbench.yaml'
params:
......@@ -265,11 +265,11 @@ rule analyzeMapping:
rule makeKmerProfilesFromTrueReads:
input:
filteredReads = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
regionXMetaData = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
filteredReads = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
regionXMetaData = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
output:
counts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
origins = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
counts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
origins = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
params:
k = lambda wildcards: wildcards.kmer
conda:
......@@ -279,10 +279,10 @@ rule makeKmerProfilesFromTrueReads:
rule compareObservedKmerProfileToTrueProfile:
input:
trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
output:
differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed.txt'
differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed.txt'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -290,11 +290,11 @@ rule compareObservedKmerProfileToTrueProfile:
rule compareExpectedKmerProfileToObserved:
input:
trueCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
trueCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
groundTruthFile = 'data/input/' + config['ground_truth']
output:
differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
params:
inputFileID = lambda wildcards: wildcards.id
conda:
......@@ -324,9 +324,9 @@ rule makeSequenceProfiles:
rule calcAverageReadLength:
input:
read1 = 'data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending']
read1 = lambda wildcards: 'data/input/{dataset}/{id}'+ config['datasets'][wildcards.dataset]['input_read_1_ending']
output:
'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt'
'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt'
conda:
'../envs/biopythonworkbench.yaml'
shell:
......@@ -335,17 +335,17 @@ rule calcAverageReadLength:
rule createSpaTypeVennDiagram:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
scores = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
scores = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
output:
venngtt = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
venntopsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
vennrandomsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
venngtt = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
venntopsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
vennrandomsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
params:
gtt = lambda wildcards : getGroundTruthType(wildcards.id)
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -354,15 +354,15 @@ rule createSpaTypeVennDiagram:
rule createValidKmerHistogram:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
output:
histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/validKmersHisto.svg'
histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/validKmersHisto.svg'
params:
gtt = lambda wildcards : getGroundTruthType(wildcards.id)
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/validKmersHisto.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/validKmersHisto.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -371,13 +371,13 @@ rule createValidKmerHistogram:
rule calcKmerErrorRates:
input:
baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
output:
error = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/error_estimate.txt'
error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt'
params:
k = lambda wildcards: wildcards.kmer
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/calcKmerErrorRates.log'
'logs/{dataset}/kmers/{kmer}/{id}/calcKmerErrorRates.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -387,12 +387,12 @@ rule calcKmerErrorRates:
if config['skipMapping']:
rule makeReadProfiles:
input:
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
output:
profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
profile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
params:
k = lambda wildcards: wildcards.kmer,
#cluster execution
......@@ -401,7 +401,7 @@ if config['skipMapping']:
gpus = '0',
walltime = '00:30:00'
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
'logs/{dataset}/kmers/{kmer}/{id}/makeReadProfiles.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -410,15 +410,15 @@ if config['skipMapping']:
else:
rule makeReadProfiles:
input:
alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending'],
index = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai', #ghost input
alignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq',
index = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai', #ghost input
regions = 'data/auxiliary/syntheticProteinAs.meta'
output:
profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
debug = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignmentExtraction.txt'
profile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
debug = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignmentExtraction.txt'
#local_coverage_estimate = 'data/auxiliary/kmers/{kmer}/{id}/local_coverage_estimate.txt'
params:
k = lambda wildcards: wildcards.kmer,
......@@ -428,7 +428,7 @@ else:
gpus = '0',
walltime = '00:30:00'
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
'logs/{dataset}/kmers/{kmer}/{id}/makeReadProfiles.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......
rule compareExpectedKmerProfileToTrueProfile:
input:
trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
groundTruthFile = 'data/input/' + config['ground_truth']
output:
differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_expected.txt'
differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
params:
inputFileID = lambda wildcards: wildcards.id
conda:
......@@ -14,9 +14,9 @@ rule compareExpectedKmerProfileToTrueProfile:
rule calcPriorProbabilities:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
output:
priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
params:
k = lambda wildcards: wildcards.kmer,
dps = config['dps'],
......@@ -27,17 +27,17 @@ rule calcPriorProbabilities:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
script:
'../scripts/calcPriorProbabilities.py'
rule calcProbabilities:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json',
prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
output:
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'