Commit cc4d7666 authored by Jan Hoeckesfeld

added wildcard datasets

parent 0a533407
@@ -9,29 +9,35 @@ validate(config, "schemas/config.schema.yaml")
 #Generate Input/Output Files from specified folder
-inputIDs, = glob_wildcards('data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'])
 kmer_lengths = config['kmers']
+dataset_inputIDs = {}
 #kmer_lengths = [24]
 
-possible_params = {'generative_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
-    'probabilistic_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
-    # if above:
-    'plot_top3_fit': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
-    'distance_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
-    'assembly_model': expand('data/output/'+config['input_folder']+'/{id}/exactMatches.tsv',id=inputIDs),
-    'calc_strand_bias': expand('data/output/'+config['input_folder']+'/{id}/strandbias.txt',id=inputIDs),
-    'mapping_diff_analysis': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
-    'map_filtered_reads': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
-    'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
-    'kmer_stats_analysis': expand('data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
-        expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
-    }
+def get_inputs(dataset):
+    run_dir = dataset
+    inputIDs, = glob_wildcards('data/input/'+dataset+'/{id}'+config["datasets"][dataset]['input_read_1_ending'])
+    dataset_inputIDs[dataset] = inputIDs
+    possible_params = {'generative_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
+        'probabilistic_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
+        # if above:
+        'plot_top3_fit': expand('data/output/'+run_dir+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
+        'distance_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+        'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
+        'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
+        'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
+        'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
+        'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
+        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
+            expand('data/output/'+run_dir+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+        }
+    return [possible_params[k] for k in possible_params.keys() if config[k]]
 
 rule all:
     input:
-        [possible_params[k] for k in possible_params.keys() if config[k]]
+        [get_inputs(dataset) for dataset in config["datasets"].keys()]
 
 ##### load rules #####
...
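For orientation, a minimal sketch of what one expand() call inside the new get_inputs yields for a single dataset; the dataset name, sample IDs, and kmer_lengths = [24] below are assumed for illustration only:

# Standalone sketch, runnable outside the workflow.
from snakemake.io import expand

run_dir = 'test'             # one key of config["datasets"]
kmer_lengths = [24]          # the commented-out default above
inputIDs = ['s1', 's2']      # what glob_wildcards would discover

print(expand('data/output/' + run_dir + '/kmers/{kmer}/{id}_top3fit.svg',
             kmer=kmer_lengths, id=inputIDs))
# ['data/output/test/kmers/24/s1_top3fit.svg',
#  'data/output/test/kmers/24/s2_top3fit.svg']

rule all then collects these per-dataset target lists, so a single run now builds the targets for every configured dataset.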
@@ -43,9 +43,13 @@ protein_a_identifier : protein A #NCTC8325
 l_lactate_permease_identifier : L-lactate permease #NCTC8325
 arlp_identifier : accessory regulator-like protein #NCTC8325
 
-input_folder : test
-input_read_1_ending : _1.fq
-input_read_2_ending : _2.fq
+datasets:
+  test:
+    input_read_1_ending : _1.fq
+    input_read_2_ending : _2.fq
+  test2:
+    input_read_1_ending : _R1.fastq
+    input_read_2_ending : _R2.fastq
 
 ### Method Analysis
...
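The flat input_folder / input_read_*_ending keys give way to a nested datasets mapping, so each dataset carries its own read-file endings. A tiny self-contained illustration of the new lookup shape (values copied from the config above):

# Illustration only; mirrors config["datasets"] as defined above.
config = {'datasets': {
    'test':  {'input_read_1_ending': '_1.fq',     'input_read_2_ending': '_2.fq'},
    'test2': {'input_read_1_ending': '_R1.fastq', 'input_read_2_ending': '_R2.fastq'},
}}
for dataset, endings in config['datasets'].items():
    print(dataset, endings['input_read_1_ending'])
# test _1.fq
# test2 _R1.fastq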
 rule spades:
     input:
-        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
     output:
-        directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
+        directory('data/auxiliary/{dataset}/{id}/spades')
     singularity:
         'docker://pegi3s/spades:latest'
     shell:
@@ -13,12 +13,12 @@ rule spades:
 rule exactMatch:
     input:
-        infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
+        infolder = 'data/auxiliary/{dataset}/{id}/spades'
     output:
-        'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
+        'data/output/{dataset}/{id}/exactMatches.tsv'
     params:
         spaSeqs = 'data/auxiliary/spaSequences.fa',
-        scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
+        scaffolds = 'data/auxiliary/{dataset}/{id}/spades/scaffolds.fasta'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
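Note that in the rules above (and in several below), the QC'd read paths no longer append '.qc' plus the per-config ending but use a fixed internal suffix. Presumably the QC step now normalizes file names so that rules downstream of QC stay dataset-agnostic; a hypothetical helper sketching the resulting scheme:

# Hypothetical helper, not part of the commit; names are illustrative.
def qc_read_path(dataset: str, sample: str, mate: int) -> str:
    # Both mates use the same internal ending regardless of the dataset's
    # original input_read_*_ending ('_1.fq', '_R1.fastq', ...).
    return f'data/auxiliary/{dataset}/{sample}.qc_internal_R{mate}.fq'

assert qc_read_path('test2', 's1', 1) == 'data/auxiliary/test2/s1.qc_internal_R1.fq'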
 rule estimateKmerCoverage:
     input:
-        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
-        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
     output:
-        histogram = report('data/output/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
-        histogramRaw = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.raw.png',
-        mean = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
-        kmererror = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmer_error.txt'
+        histogram = report('data/output/{dataset}/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
+        histogramRaw = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.raw.png',
+        mean = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
+        kmererror = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         #cluster execution
@@ -15,9 +15,9 @@ rule estimateKmerCoverage:
         mem = '64G',
         walltime = '00:45:00'
     log:
-        'logs/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     benchmark:
-        'benchmarks/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'benchmarks/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
@@ -27,14 +27,14 @@ rule estimateKmerCoverage:
         '''
 
 rule estimateKmerCoverageFiltered:
     input:
-        reads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
+        reads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
     output:
-        histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
+        histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
     params:
         k = lambda wildcards: wildcards.kmer
         #TODO: Threads = 2 ?
     log:
-        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
     conda:
         '../envs/main.yaml'
     script:
@@ -44,11 +44,11 @@ rule estimateKmerCoverageFiltered:
 rule estimateKmerCoverageAlignment:
     input:
-        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt',
-        readLengthEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt',
-        baseErrorEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
+        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt',
+        readLengthEstimate = 'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt',
+        baseErrorEstimate = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
     output:
-        kmerCoverage = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
+        kmerCoverage = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         # cluster execution
@@ -57,7 +57,7 @@ rule estimateKmerCoverageAlignment:
         mem = '16G',
         walltime = '00:30:30'
     log:
-        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
@@ -65,9 +65,9 @@ rule estimateKmerCoverageAlignment:
 rule estimateCoverageAlignment:
     input:
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam'
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam'
     output:
-        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt'
+        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt'
     params:
         # cluster execution
         cpus = '1',
@@ -75,7 +75,7 @@ rule estimateCoverageAlignment:
         mem = '16G',
         walltime = '00:30:30'
     log:
-        'logs/'+config['input_folder']+'/{id}/estimateKmerCoverage_alignment.log'
+        'logs/{dataset}/{id}/estimateKmerCoverage_alignment.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     shell:
@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
 rule calcPriorProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json'
     output:
-        priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
+        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         dps = config['dps'],
@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
         mem = '2G',
         walltime = '00:05:30'
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
 rule calcProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
-        prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
     output:
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
     params:
         dps = config['dps'],
         # cluster execution
@@ -117,18 +117,18 @@ rule calcProbabilitiesCoverage:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
     script:
         '../scripts/calcSpaTypeProbabilities.py'
 
 rule createFitnessPlots:
     input:
-        counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
         ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
     output:
-        report('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
+        report('data/output/{dataset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
     params:
         dps = config['dps'],
         # cluster execution
@@ -147,7 +147,7 @@ rule calcExpectedCounts:
         kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
         counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
     output:
-        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json'
+        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json'
     params:
         k = lambda wildcards: wildcards.kmer,
         # cluster execution
...
 rule distance:
     input:
-        readProfile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
+        readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
         spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
     output:
-        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.euclidean.tsv'
+        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
     conda:
         '../envs/biopythonworkbench.yaml'
     params:
...
 # Returns the correct filename in which the required information is stored depending on the configuration setting
 def determineKmerCoverageEstimateFile():
     if config['kmerCoverageEstimationMethod'] == 'alignment':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
     elif config['kmerCoverageEstimationMethod'] == 'countMean':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
     elif config['kmerCoverageEstimationMethod'] == 'countPoisson':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
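Note that the returned string keeps its {dataset}/{kmer}/{id} wildcards: Snakemake fills them in per job in the rule that consumes the value (calcExpectedCounts above uses it as an input). The substitution is equivalent to plain string formatting, sketched here with assumed example values:

# Illustration only: wildcard filling as ordinary str.format.
template = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
print(template.format(dataset='test', kmer=24, id='s1'))
# data/auxiliary/test/kmers/24/s1/coverage_estimate_alignmentbased.txt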
 # Returns the corresponding ground truth spa-type for a given file (sample) id
 def getGroundTruthType(fid):
...
@@ -75,10 +75,10 @@ rule bwa:
     input:
         bwi = 'data/auxiliary/matchBoard.fa.bwt',
         mb = 'data/auxiliary/matchBoard.fa',
-        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
     output:
-        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.bam'
+        'data/auxiliary/{dataset}/{id}/alignment.bam'
     params:
         #cluster execution
         cpus = '1',
@@ -95,10 +95,10 @@ rule bwa:
 rule determineStrandBias:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam.bai'
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam.bai'
     output:
-        strandbias = report('data/output/'+config['input_folder']+'/{id}/strandbias.txt',category='Strand Bias')
+        strandbias = report('data/output/{dataset}/{id}/strandbias.txt',category='Strand Bias')
     params:
         #cluster execution
         cpus = '1',
@@ -114,10 +114,10 @@ rule determineStrandBias:
 rule filter_primary_matches:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
         whitelist = 'data/auxiliary/syntheticProteinAs.meta'
     output:
-        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
+        'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
     params:
         #cluster execution
         cpus = '1',
@@ -131,10 +131,10 @@ rule filter_primary_matches:
 rule determineFilteredStrandBias:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai'
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai'
     output:
-        strandbias = 'data/auxiliary/'+config['input_folder']+'/{id}/strandbias.filtered.txt'
+        strandbias = 'data/auxiliary/{dataset}/{id}/strandbias.filtered.txt'
     params:
         #cluster execution
         cpus = '1',
@@ -148,9 +148,9 @@ rule determineFilteredStrandBias:
 rule extractFilteredReadsAsFastQ:
     input:
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
     output:
-        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
+        filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
     params:
         #cluster execution
         cpus = '1',
@@ -167,13 +167,13 @@ rule extractFilteredReadsAsFastQ:
 rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     input:
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
-        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        errors = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
-        deviations = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/countDeviations.svg'
+        errors = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
+        deviations = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id),
         #cluster execution
@@ -184,35 +184,35 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
+        'logs/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
     script:
         '../scripts/createKmerErrorDistributionPlots.py'
 
 rule likelihoodAnalysis_COVERAGE_BASED:
     input:
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
-        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        likelihoodAnalysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        likelihoodAnalysis = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id)
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        'logs/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     script:
         '../scripts/likelihoodBreakdown.py'
 
 rule mapAgainstGroundTruth:
     input:
-        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq',
+        filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq',
         groundTruthSequence = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa',
         groundTruthIndex = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa.bwt'
     output:
-        'data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
+        'data/output/{dataset}/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
     params:
         # cluster execution
         cpus = '1',
@@ -244,14 +244,14 @@ rule verifyUniqueness:
 rule analyzeMapping:
     input:
-        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
-        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending'],
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai',
-        metaInf = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
+        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq',
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai',
+        metaInf = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
     output:
-        correctAlignments = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
-        analysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison'
+        correctAlignments = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
+        analysis = 'data/output/{dataset}/methodAnalysis/{id}/mapping.comparison'
     conda:
         '../envs/biopythonworkbench.yaml'
     params:
@@ -265,11 +265,11 @@ rule analyzeMapping:
 rule makeKmerProfilesFromTrueReads:
     input:
-        filteredReads = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
-        regionXMetaData = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
+        filteredReads = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
+        regionXMetaData = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
     output:
-        counts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
-        origins = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
+        counts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
+        origins = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
     params:
         k = lambda wildcards: wildcards.kmer
     conda:
@@ -279,10 +279,10 @@ rule makeKmerProfilesFromTrueReads:
 rule compareObservedKmerProfileToTrueProfile:
     input:
-        trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
+        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
     output:
-        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed.txt'
+        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed.txt'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
@@ -290,11 +290,11 @@ rule compareObservedKmerProfileToTrueProfile:
 rule compareExpectedKmerProfileToObserved:
     input:
-        trueCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
+        trueCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         groundTruthFile = 'data/input/' + config['ground_truth']
     output:
-        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
+        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
     params:
         inputFileID = lambda wildcards: wildcards.id
     conda:
@@ -324,9 +324,9 @@ rule makeSequenceProfiles: