Jan Hoeckesfeld / Snakemake Ngs Spa Typing · Commits

Commit cc4d7666, authored Jan 20, 2021 by Jan Hoeckesfeld

added wildcard datasets

parent 0a533407 · Changes 9
Snakefile
View file @ cc4d7666
...
...
@@ -9,29 +9,35 @@ validate(config, "schemas/config.schema.yaml")
 #Generate Input/Output Files from specified folder
-inputIDs, = glob_wildcards('data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'])
 kmer_lengths = config['kmers']
+dataset_inputIDs = {}
 #kmer_lengths = [24]
-possible_params = {'generative_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
-    'probabilistic_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
-    # if above:
-    'plot_top3_fit': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
-    'distance_model': expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
-    'assembly_model': expand('data/output/'+config['input_folder']+'/{id}/exactMatches.tsv',id=inputIDs),
-    'calc_strand_bias': expand('data/output/'+config['input_folder']+'/{id}/strandbias.txt',id=inputIDs),
-    'mapping_diff_analysis': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
-    'map_filtered_reads': expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
-    'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
-    'kmer_stats_analysis': expand('data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
-        expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
-    }
+def get_inputs(dataset):
+    run_dir = dataset
+    inputIDs, = glob_wildcards('data/input/'+dataset+'/{id}'+config["datasets"][dataset]['input_read_1_ending'])
+    dataset_inputIDs[dataset] = inputIDs
+    possible_params = {'generative_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
+        'probabilistic_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
+        # if above:
+        'plot_top3_fit': expand('data/output/'+run_dir+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
+        'distance_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+        'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
+        'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
+        'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
+        'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
+        'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
+        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
+            expand('data/output/'+run_dir+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+        }
+    return [possible_params[k] for k in possible_params.keys() if config[k]]

 rule all:
     input:
-        [
-            possible_params[k] for k in possible_params.keys() if config[k]
-        ]
+        [
+            get_inputs(dataset) for dataset in config["datasets"].keys()
+        ]
##### load rules #####
...
...
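Note on the mechanics this commit relies on: rule all now requests concrete per-dataset paths built by get_inputs, and Snakemake fills the new {dataset} wildcard in each producing rule by pattern-matching those requested paths against the rules' outputs. A minimal sketch of the pattern, with a hypothetical done.txt target that is not part of this repo:

configfile: "config.example.yaml"

def all_targets():
    # one concrete target per dataset key, e.g. data/output/test/done.txt
    return ['data/output/' + d + '/done.txt' for d in config["datasets"].keys()]

rule all:
    input:
        all_targets()

rule make_done:
    # {dataset} is back-filled from the concrete paths requested by rule all
    output:
        'data/output/{dataset}/done.txt'
    shell:
        'touch {output}'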
config.example.yaml
View file @ cc4d7666
...
...
@@ -43,9 +43,13 @@ protein_a_identifier : protein A #NCTC8325
 l_lactate_permease_identifier : L-lactate permease #NCTC8325
 arlp_identifier : accessory regulator-like protein #NCTC8325

-input_folder : test
-input_read_1_ending : _1.fq
-input_read_2_ending : _2.fq
+datasets:
+    test:
+        input_read_1_ending : _1.fq
+        input_read_2_ending : _2.fq
+    test2:
+        input_read_1_ending : _R1.fastq
+        input_read_2_ending : _R2.fastq

 ### Method Analysis
...
...
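For reference, a small Python sketch (assuming the config layout above) of how the new mapping is consumed: every lookup is keyed by dataset name instead of the old global input_folder/read-ending pair.

config = {
    "datasets": {
        "test":  {"input_read_1_ending": "_1.fq", "input_read_2_ending": "_2.fq"},
        "test2": {"input_read_1_ending": "_R1.fastq", "input_read_2_ending": "_R2.fastq"},
    }
}

for dataset, opts in config["datasets"].items():
    # e.g. data/input/test/{id}_1.fq and data/input/test2/{id}_R1.fastq
    print('data/input/' + dataset + '/{id}' + opts["input_read_1_ending"])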
rules/assembly.smk
View file @ cc4d7666
 rule spades:
     input:
-        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
     output:
-        directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
+        directory('data/auxiliary/{dataset}/{id}/spades')
     singularity:
         'docker://pegi3s/spades:latest'
     shell:
...
...
@@ -13,12 +13,12 @@ rule spades:
 rule exactMatch:
     input:
-        infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
+        infolder = 'data/auxiliary/{dataset}/{id}/spades'
     output:
-        'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
+        'data/output/{dataset}/{id}/exactMatches.tsv'
     params:
         spaSeqs = 'data/auxiliary/spaSequences.fa',
-        scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
+        scaffolds = 'data/auxiliary/{dataset}/{id}/spades/scaffolds.fasta'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
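The read inputs above no longer carry the configured read endings; they now expect a normalized internal name (.qc_internal_R1.fq / .qc_internal_R2.fq), so downstream rules stay identical across datasets with different raw endings. A hypothetical QC rule producing such names (the actual QC rule is not shown in this commit) could look like:

rule qc:
    input:
        # raw reads still use the per-dataset ending from the config
        read1 = lambda wc: 'data/input/' + wc.dataset + '/' + wc.id + config['datasets'][wc.dataset]['input_read_1_ending']
    output:
        'data/auxiliary/{dataset}/{id}.qc_internal_R1.fq'
    shell:
        # placeholder for the real trimming/QC step
        'cp {input.read1} {output}'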
rules/coverageBased.smk
View file @ cc4d7666
 rule estimateKmerCoverage:
     input:
-        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
-        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
     output:
-        histogram = report('data/output/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
-        histogramRaw = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.raw.png',
-        mean = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
-        kmererror = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmer_error.txt'
+        histogram = report('data/output/{dataset}/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
+        histogramRaw = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.raw.png',
+        mean = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
+        kmererror = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         #cluster execution
...
...
@@ -15,9 +15,9 @@ rule estimateKmerCoverage:
         mem = '64G',
         walltime = '00:45:00'
     log:
-        'logs/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     benchmark:
-        'benchmarks/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'benchmarks/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -27,14 +27,14 @@ rule estimateKmerCoverage:
         '''

 rule estimateKmerCoverageFiltered:
     input:
-        reads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
+        reads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
     output:
-        histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
+        histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
     params:
         k = lambda wildcards: wildcards.kmer
         #TODO: Threads = 2 ?
     log:
-        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
     conda:
         '../envs/main.yaml'
     script:
...
...
@@ -44,11 +44,11 @@ rule estimateKmerCoverageFiltered:
 rule estimateKmerCoverageAlignment:
     input:
-        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt',
-        readLengthEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt',
-        baseErrorEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
+        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt',
+        readLengthEstimate = 'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt',
+        baseErrorEstimate = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
     output:
-        kmerCoverage = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
+        kmerCoverage = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         # cluster execution
...
...
@@ -57,7 +57,7 @@ rule estimateKmerCoverageAlignment:
         mem = '16G',
         walltime = '00:30:30'
     log:
-        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverage.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -65,9 +65,9 @@ rule estimateKmerCoverageAlignment:
 rule estimateCoverageAlignment:
     input:
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam'
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam'
     output:
-        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt'
+        coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt'
     params:
         # cluster execution
         cpus = '1',
...
...
@@ -75,7 +75,7 @@ rule estimateCoverageAlignment:
         mem = '16G',
         walltime = '00:30:30'
     log:
-        'logs/'+config['input_folder']+'/{id}/estimateKmerCoverage_alignment.log'
+        'logs/{dataset}/{id}/estimateKmerCoverage_alignment.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     shell:
...
...
@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
 rule calcPriorProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json'
     output:
-        priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
+        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         dps = config['dps'],
...
...
@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
         mem = '2G',
         walltime = '00:05:30'
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
 rule calcProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
-        prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
     output:
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
     params:
         dps = config['dps'],
         # cluster execution
...
...
@@ -117,18 +117,18 @@ rule calcProbabilitiesCoverage:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
     script:
         '../scripts/calcSpaTypeProbabilities.py'

 rule createFitnessPlots:
     input:
-        counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
         ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
     output:
-        report('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
+        report('data/output/{dataset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
     params:
         dps = config['dps'],
         # cluster execution
...
...
@@ -147,7 +147,7 @@ rule calcExpectedCounts:
         kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
         counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
     output:
-        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json'
+        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json'
     params:
         k = lambda wildcards: wildcards.kmer,
         # cluster execution
...
...
rules/euclidean.smk
View file @ cc4d7666
 rule distance:
     input:
-        readProfile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
+        readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
         spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
     output:
-        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.euclidean.tsv'
+        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
     conda:
         '../envs/biopythonworkbench.yaml'
     params:
...
...
rules/kmerApproach.smk
View file @ cc4d7666
 # Returns the correct filename in which the required information is stored depending on the configuration setting
 def determineKmerCoverageEstimateFile():
     if config['kmerCoverageEstimationMethod'] == 'alignment':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
     elif config['kmerCoverageEstimationMethod'] == 'countMean':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
     elif config['kmerCoverageEstimationMethod'] == 'countPoisson':
-        return 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
+        return 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt'
# Returns the corresponding ground truth spa-type for a given file (sample) id
def getGroundTruthType(fid):
...
...
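Note that the strings returned by determineKmerCoverageEstimateFile still contain literal {dataset}, {kmer} and {id} placeholders; they are only substituted when the template is used in a rule (see calcExpectedCounts in rules/coverageBased.smk). A standalone illustration of that substitution, with made-up values:

tpl = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
print(tpl.format(dataset='test', kmer=24, id='sample1'))
# -> data/auxiliary/test/kmers/24/sample1/coverage_estimate_alignmentbased.txt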
@@ -75,10 +75,10 @@ rule bwa:
     input:
         bwi = 'data/auxiliary/matchBoard.fa.bwt',
         mb = 'data/auxiliary/matchBoard.fa',
-        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
+        read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
     output:
-        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.bam'
+        'data/auxiliary/{dataset}/{id}/alignment.bam'
     params:
         #cluster execution
         cpus = '1',
...
...
@@ -95,10 +95,10 @@ rule bwa:
 rule determineStrandBias:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam.bai'
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam.bai'
     output:
-        strandbias = report('data/output/'+config['input_folder']+'/{id}/strandbias.txt',category='Strand Bias')
+        strandbias = report('data/output/{dataset}/{id}/strandbias.txt',category='Strand Bias')
     params:
         #cluster execution
         cpus = '1',
...
...
@@ -114,10 +114,10 @@ rule determineStrandBias:
 rule filter_primary_matches:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam',
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam',
         whitelist = 'data/auxiliary/syntheticProteinAs.meta'
     output:
-        'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
+        'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
     params:
         #cluster execution
         cpus = '1',
...
...
@@ -131,10 +131,10 @@ rule filter_primary_matches:
 rule determineFilteredStrandBias:
     input:
-        alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai'
+        alignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai'
     output:
-        strandbias = 'data/auxiliary/'+config['input_folder']+'/{id}/strandbias.filtered.txt'
+        strandbias = 'data/auxiliary/{dataset}/{id}/strandbias.filtered.txt'
     params:
         #cluster execution
         cpus = '1',
...
...
@@ -148,9 +148,9 @@ rule determineFilteredStrandBias:
 rule extractFilteredReadsAsFastQ:
     input:
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam'
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam'
     output:
-        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
+        filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
     params:
         #cluster execution
         cpus = '1',
...
...
@@ -167,13 +167,13 @@ rule extractFilteredReadsAsFastQ:
 rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     input:
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
-        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        errors = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
-        deviations = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/countDeviations.svg'
+        errors = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
+        deviations = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id),
         #cluster execution
...
...
@@ -184,35 +184,35 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
+        'logs/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
     script:
         '../scripts/createKmerErrorDistributionPlots.py'

 rule likelihoodAnalysis_COVERAGE_BASED:
     input:
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
-        kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt'
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        likelihoodAnalysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        likelihoodAnalysis = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id)
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        'logs/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     script:
         '../scripts/likelihoodBreakdown.py'

 rule mapAgainstGroundTruth:
     input:
-        filteredReads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq',
+        filteredReads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq',
         groundTruthSequence = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa',
         groundTruthIndex = lambda wildcards: 'data/input/ref/'+getGroundTruthType(wildcards.id)+'.fa.bwt'
     output:
-        'data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
+        'data/output/{dataset}/methodAnalysis/{id}/alignmentToGroundTruthType.bam'
     params:
         # cluster execution
         cpus = '1',
...
...
@@ -244,14 +244,14 @@ rule verifyUniqueness:
 rule analyzeMapping:
     input:
-        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
-        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending'],
-        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
-        idx = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai',
-        metaInf = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
+        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
+        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq',
+        filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
+        idx = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai',
+        metaInf = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
     output:
-        correctAlignments = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
-        analysis = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison'
+        correctAlignments = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
+        analysis = 'data/output/{dataset}/methodAnalysis/{id}/mapping.comparison'
     conda:
         '../envs/biopythonworkbench.yaml'
     params:
...
...
@@ -265,11 +265,11 @@ rule analyzeMapping:
 rule makeKmerProfilesFromTrueReads:
     input:
-        filteredReads = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/correctMapping.fa',
-        regionXMetaData = lambda wildcards : 'data/input/'+config['input_folder']+'/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
+        filteredReads = 'data/output/{dataset}/methodAnalysis/{id}/correctMapping.fa',
+        regionXMetaData = lambda wildcards : 'data/input/{dataset}/syntheticReferencesMetaData/'+getGroundTruthType(wildcards.id)+'.meta'
     output:
-        counts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
-        origins = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
+        counts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
+        origins = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/kmerOrigins.json'
     params:
         k = lambda wildcards: wildcards.kmer
     conda:
...
...
@@ -279,10 +279,10 @@ rule makeKmerProfilesFromTrueReads:
 rule compareObservedKmerProfileToTrueProfile:
     input:
-        trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
-        observedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
+        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
+        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
     output:
-        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed.txt'
+        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed.txt'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -290,11 +290,11 @@ rule compareObservedKmerProfileToTrueProfile:
 rule compareExpectedKmerProfileToObserved:
     input:
-        trueCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
+        trueCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         groundTruthFile = 'data/input/' + config['ground_truth']
     output:
-        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
+        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_observed_expected.txt'
     params:
         inputFileID = lambda wildcards: wildcards.id
     conda:
...
...
@@ -324,9 +324,9 @@ rule makeSequenceProfiles:
 rule calcAverageReadLength:
     input:
-        read1 = 'data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending']
+        read1 = lambda wildcards: 'data/input/{dataset}/{id}'+config['datasets'][wildcards.dataset]['input_read_1_ending']
     output:
-        'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt'
+        'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt'
     conda:
         '../envs/biopythonworkbench.yaml'
     shell:
...
...
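The new read1 lambda above mixes both lookup styles: the per-dataset read ending is resolved through wildcards.dataset, while {dataset}/{id} in the returned template are left for Snakemake to substitute. A more explicit, hypothetical equivalent that builds the full concrete path itself:

def read1_path(wildcards):
    # look up the raw read ending configured for this dataset
    ending = config['datasets'][wildcards.dataset]['input_read_1_ending']
    return 'data/input/{dataset}/{id}'.format(
        dataset=wildcards.dataset, id=wildcards.id) + ending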
@@ -335,17 +335,17 @@ rule calcAverageReadLength:
 rule createSpaTypeVennDiagram:
     input:
-        expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-        scores = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+        scores = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
     output:
-        venngtt = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
-        venntopsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
-        vennrandomsix = 'data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
+        venngtt = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
+        venntopsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
+        vennrandomsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
     params:
         gtt = lambda wildcards : getGroundTruthType(wildcards.id)
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -354,15 +354,15 @@ rule createSpaTypeVennDiagram:
 rule createValidKmerHistogram:
     input:
-        expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-        observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
+        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
     output:
-        histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/validKmersHisto.svg'
+        histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/validKmersHisto.svg'
     params:
         gtt = lambda wildcards : getGroundTruthType(wildcards.id)
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/validKmersHisto.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/validKmersHisto.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -371,13 +371,13 @@ rule createValidKmerHistogram:
 rule calcKmerErrorRates:
     input:
-        baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
+        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
     output:
-        error = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/error_estimate.txt'
+        error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt'
     params:
         k = lambda wildcards: wildcards.kmer
     log:
-        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/calcKmerErrorRates.log'
+        'logs/{dataset}/kmers/{kmer}/{id}/calcKmerErrorRates.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
...
...
@@ -387,12 +387,12 @@ rule calcKmerErrorRates:
 if config['skipMapping']:

     rule makeReadProfiles:
         input:
-            expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
-            read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-            read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
+            expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
+            read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+            read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
         output:
-            profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
-            counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json'
+            profile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
+            counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json'
         params:
             k = lambda wildcards: wildcards.kmer,
             #cluster execution
...
...
@@ -401,7 +401,7 @@ if config['skipMapping']:
             gpus = '0',
             walltime = '00:30:00'
         log:
-            'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
+            'logs/{dataset}/kmers/{kmer}/{id}/makeReadProfiles.log'
         conda:
             '../envs/biopythonworkbench.yaml'
         script:
...
...
@@ -410,15 +410,15 @@ if config['skipMapping']:
 else:

     rule makeReadProfiles:
         input:
-            alignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam',
-            read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
-            read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending'],
-            index = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.filtered.bam.bai', #ghost input
+            alignment = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam',
+            read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
+            read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq',
+            index = 'data/auxiliary/{dataset}/{id}/alignment.filtered.bam.bai', #ghost input
             regions = 'data/auxiliary/syntheticProteinAs.meta'
         output:
-            profile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
-            counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
-            debug = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignmentExtraction.txt'
+            profile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
+            counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
+            debug = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignmentExtraction.txt'
         #local_coverage_estimate = 'data/auxiliary/kmers/{kmer}/{id}/local_coverage_estimate.txt'
         params:
             k = lambda wildcards: wildcards.kmer,
...
...
@@ -428,7 +428,7 @@ else:
             gpus = '0',
             walltime = '00:30:00'
         log:
-            'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/makeReadProfiles.log'
+            'logs/{dataset}/kmers/{kmer}/{id}/makeReadProfiles.log'
         conda:
             '../envs/biopythonworkbench.yaml'
         script:
...
...
rules/probabilistic.smk
View file @ cc4d7666
 rule compareExpectedKmerProfileToTrueProfile:
     input:
-        trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
-        expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
+        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
+        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         groundTruthFile = 'data/input/' + config['ground_truth']
     output:
-        differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_expected.txt'
+        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
     params:
         inputFileID = lambda wildcards: wildcards.id
     conda:
...
...
@@ -14,9 +14,9 @@ rule compareExpectedKmerProfileToTrueProfile:
 rule calcPriorProbabilities:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
     output:
-        priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
+        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         dps = config['dps'],
...
...
@@ -27,17 +27,17 @@ rule calcPriorProbabilities:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
+        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
     script:
         '../scripts/calcPriorProbabilities.py'

 rule calcProbabilities:
     input:
-        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json',
-        prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
+        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
+        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
...