Commit ef1f913a authored by Jan Hoeckesfeld

Merge branch 'iterationset-tests-experimental' into hilbert-hpc

parents 558ac3d3 7491e4d7
......@@ -18,4 +18,6 @@ config.yaml
.snakemake/
# singularity
*.simg
\ No newline at end of file
*.simg
*.sif
hilbert/
......@@ -9,49 +9,71 @@ validate(config, "schemas/config.schema.yaml")
#Generate Input/Output Files from specified folder
inputIDs, = glob_wildcards('data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'])
kmer_lengths = config['kmers']
kmer_lengths = config['kmers']
dataset_inputIDs = {}
#kmer_lengths = [24]
def get_general_inputs(wildcards):
run_dir = wildcards.dataset
inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
dataset_inputIDs[wildcards.dataset] = inputIDs
possible_params = {
'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs)
}
return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
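The dict-of-target-lists plus the closing comprehension replaces the old if-cascade of get_input() below. A minimal standalone sketch of the selection pattern (hypothetical switch and file names, not from the workflow):

config = {'assembly_model': True, 'calc_strand_bias': False}
possible_params = {
    'assembly_model': ['a.tsv', 'b.tsv'],  # targets built only if the switch is on
    'calc_strand_bias': ['c.txt'],
}
# keep only the targets whose config switch is truthy, flattened into one list
targets = [item for k in possible_params if config[k] for item in possible_params[k]]
assert targets == ['a.tsv', 'b.tsv']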
#Helper function that assembles required input files dependent on configuration settings
def get_input():
input_list = []
if config['generative_model']:
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths))
if config['probabilistic_model']:
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths))
if config['plot_top3_fit']:
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs))
if config['distance_model']:
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths))
if config['assembly_model']:
input_list.append(expand('data/output/'+config['input_folder']+'/{id}/exactMatches.tsv',id=inputIDs))
if config['calc_strand_bias']:
input_list.append(expand('data/output/'+config['input_folder']+'/{id}/strandbias.txt',id=inputIDs))
if config['mapping_diff_analysis']:
input_list.append(expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs))
if config['map_filtered_reads']:
input_list.append(expand('data/output/'+config['input_folder']+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs))
if config['verifyUniqueness']:
input_list.append(expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths))
if config['kmer_stats_analysis']:
input_list.append(expand('data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs))
input_list.append(expand('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs))
return input_list
def get_iterset_inputs(wildcards):
inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
possible_params = {
'generative_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset +'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
'distance_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset +'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
'probabilistic_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset + '/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
# only meaningful if probabilistic_model above is enabled:
'plot_top3_fit': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset + '/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
'kmer_stats_analysis': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset +'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
}
return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
def use_itersets():
if config['probabilistic_model'] and config['itersets']:
return config['itersets']
return ['O']
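# use_itersets() only honours the configured iteration sets when the probabilistic
# model is enabled; every other configuration falls back to the plain observed set.
# Expected behaviour, assuming the config values shown:
#   config = {'probabilistic_model': True,  'itersets': ['O', 'V', 'OnV']}  ->  ['O', 'V', 'OnV']
#   config = {'probabilistic_model': False, 'itersets': ['O', 'V', 'OnV']}  ->  ['O']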
rule all:
input:
get_input()
run_datasets = expand('data/output/{dataset}_summary.md', dataset=config['datasets'].keys())
rule run_dataset:
input:
general = get_general_inputs,
summaries = expand('data/auxiliary/{dataset}/{iterset}_summary.md', iterset=use_itersets(), allow_missing=True)
output:
summary = 'data/output/{dataset}_summary.md'
# TODO create summary
shell:
'touch {output.summary}'
rule run_iterset:
input:
get_iterset_inputs
output:
out = 'data/auxiliary/{dataset}/{iterset}_summary.md'
# TODO create summary
shell:
'touch {output.out}'
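# Note on the aggregation chain formed by the three rules above:
#   rule all         requests one data/output/{dataset}_summary.md per configured dataset,
#   rule run_dataset pulls that dataset's general targets plus one {iterset}_summary.md
#                    per iteration set returned by use_itersets(), and
#   rule run_iterset pulls the model outputs for a single dataset/iterset combination.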
##### load rules #####
include: "rules/assembly.snk"
include: "rules/shared.snk"
include: "rules/kmerApproach.snk"
include: "rules/coverageBased.snk"
include: "rules/probabilistic.snk"
include: "rules/euclidean.snk"
include: "rules/assembly.smk"
include: "rules/shared.smk"
include: "rules/kmerApproach.smk"
include: "rules/coverageBased.smk"
include: "rules/probabilistic.smk"
include: "rules/euclidean.smk"
......@@ -43,9 +43,13 @@ protein_a_identifier : protein A #NCTC8325
l_lactate_permease_identifier : L-lactate permease #NCTC8325
arlp_identifier : accessory regulator-like protein #NCTC8325
input_folder : test
input_read_1_ending : _1.fq
input_read_2_ending : _2.fq
datasets:
test:
input_read_1_ending : _1.fq
input_read_2_ending : _2.fq
test2:
input_read_1_ending : _R1.fastq
input_read_2_ending : _R2.fastq
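# Every key below datasets names a folder under data/input/ and carries its own
# read-file endings, replacing the old global input_folder/input_read_*_ending keys.
# A hypothetical additional entry would follow the same shape:
# study3:
#     input_read_1_ending : _R1.fq
#     input_read_2_ending : _R2.fq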
### Method Analysis
......@@ -90,7 +94,7 @@ deviationCutoff : 2.5
skipMapping: False
plot_top3_fit : False
#choose the iteration sets to run: any of O, V, OuV (O union V) and OnV (O intersect V)
itersetType : O
itersets: [O,V,OnV]
###Blast Parameter
blast_word_size : 4
......
rule spades:
input:
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq'
output:
directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
directory('data/auxiliary/{dataset}/{id}/spades')
singularity:
'docker://pegi3s/spades:latest'
shell:
......@@ -13,12 +13,12 @@ rule spades:
rule exactMatch:
input:
infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
infolder = 'data/auxiliary/{dataset}/{id}/spades'
output:
'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
'data/output/{dataset}/{id}/exactMatches.tsv'
params:
spaSeqs = 'data/auxiliary/spaSequences.fa',
scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
scaffolds = 'data/auxiliary/{dataset}/{id}/spades/scaffolds.fasta'
conda:
'../envs/biopythonworkbench.yaml'
script:
......
rule estimateKmerCoverage:
input:
read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
output:
histogram = report('data/output/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
histogramRaw = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.raw.png',
mean = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
kmererror = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmer_error.txt'
histogram = report('data/output/{dataset}/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
histogramRaw = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.raw.png',
mean = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
kmererror = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
params:
k = lambda wildcards: wildcards.kmer,
#cluster execution
cpus = '1', #TODO: This could in theory be sped up significantly using a shared cache and multithreading
gpus = '0',
mem = '20G',
walltime = '00:30:00'
mem = '64G',
walltime = '00:45:00'
log:
'logs/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
benchmark:
'benchmarks/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'benchmarks/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -27,14 +27,14 @@ rule estimateKmerCoverage:
'''
rule estimateKmerCoverageFiltered:
input:
reads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
reads = 'data/auxiliary/{dataset}/{id}/filteredReads.fastq'
output:
histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
histogram = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
params:
k = lambda wildcards: wildcards.kmer
#TODO: Threads = 2 ?
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
conda:
'../envs/main.yaml'
script:
......@@ -44,11 +44,11 @@ rule estimateKmerCoverageFiltered:
rule estimateKmerCoverageAlignment:
input:
coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt',
readLengthEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt',
baseErrorEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt',
readLengthEstimate = 'data/auxiliary/{dataset}/{id}/readLengthEstimate.txt',
baseErrorEstimate = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
output:
kmerCoverage = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
kmerCoverage = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
params:
k = lambda wildcards: wildcards.kmer,
# cluster execution
......@@ -57,7 +57,7 @@ rule estimateKmerCoverageAlignment:
mem = '16G',
walltime = '00:30:30'
log:
'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverage.log'
'logs/{dataset}/kmers/{kmer}/{id}/estimateKmerCoverage.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -65,9 +65,9 @@ rule estimateKmerCoverageAlignment:
rule estimateCoverageAlignment:
input:
filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam'
filteredAlignment = 'data/auxiliary/{dataset}/{id}/alignment.sorted.bam'
output:
coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt'
coverageEstimate = 'data/auxiliary/{dataset}/{id}/coverageEstimate.txt'
params:
# cluster execution
cpus = '1',
......@@ -75,7 +75,7 @@ rule estimateCoverageAlignment:
mem = '16G',
walltime = '00:30:30'
log:
'logs/'+config['input_folder']+'/{id}/estimateKmerCoverage_alignment.log'
'logs/{dataset}/{id}/estimateKmerCoverage_alignment.log'
conda:
'../envs/biopythonworkbench.yaml'
shell:
......@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
rule calcPriorProbabilitiesCoverage:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json'
likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json'
output:
priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
priorFilePath = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
params:
k = lambda wildcards: wildcards.kmer,
dps = config['dps'],
......@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
mem = '2G',
walltime = '00:05:30'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
conda:
'../envs/biopythonworkbench.yaml'
script:
......@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
rule calcProbabilitiesCoverage:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
prior = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
output:
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
params:
dps = config['dps'],
# cluster execution
......@@ -117,18 +117,18 @@ rule calcProbabilitiesCoverage:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
script:
'../scripts/calcSpaTypeProbabilities.py'
rule createFitnessPlots:
input:
counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
output:
report('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
report('data/output/{dataset}/{iterset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
params:
dps = config['dps'],
# cluster execution
......@@ -147,7 +147,7 @@ rule calcExpectedCounts:
kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
output:
'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json'
'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json'
params:
k = lambda wildcards: wildcards.kmer,
# cluster execution
......
rule distance:
input:
readProfile = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.profile.json',
readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
output:
'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.euclidean.tsv'
'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
conda:
'../envs/biopythonworkbench.yaml'
params:
......
rule compareExpectedKmerProfileToTrueProfile:
input:
trueCounts = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/correctCounts.json',
expectedCounts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
groundTruthFile = 'data/input/' + config['ground_truth']
output:
differences = 'data/output/'+config['input_folder']+'/methodAnalysis/{id}/{kmer}/differences_expected.txt'
differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
params:
inputFileID = lambda wildcards: wildcards.id
conda:
......@@ -14,9 +14,9 @@ rule compareExpectedKmerProfileToTrueProfile:
rule calcPriorProbabilities:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
output:
priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
params:
k = lambda wildcards: wildcards.kmer,
dps = config['dps'],
......@@ -27,17 +27,17 @@ rule calcPriorProbabilities:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
script:
'../scripts/calcPriorProbabilities.py'
rule calcProbabilities:
input:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json',
prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior.txt'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
output:
probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
params:
dps = config['dps'],
cpus = '1',
......@@ -47,7 +47,7 @@ rule calcProbabilities:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities.log'
'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
script:
'../scripts/calcSpaTypeProbabilities.py'
......@@ -62,31 +62,32 @@ def extractTsvValue(filePath,line,nolabels=False):
rule calcLikelihoods:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
kmerError = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmer_error.txt',
expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
output:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
unexpectedLikelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
#diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
benchmark:
'benchmarks/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
params:
e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
itersetType = config['itersetType'],
itersetType = lambda wildcards: wildcards.iterset,
#cluster execution
cpus = '8',
mem = '20G',
gpus = '0',
walltime = '03:00:00'
singularity:
'docker://phspo/ckmertools:iterationset-tests'
'singularity_container/ckmertools_iterset.sif'
#'docker://phspo/ckmertools:iterationset-tests'
shell:
'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log} --itersetType {params.itersetType} --hammingdist {input.V_kmers_distances} --kmersindex {input.V_kmers}'
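# A reading of the flags wired into the c_kmertools call above, inferred from the
# rule's inputs and params rather than from c_kmertools documentation:
#   --e / --o                        expected vs. observed k-mer counts (JSON)
#   --kmererror / --d                estimated k-mer error rate and deviation cutoff
#   --itersetType                    now the {iterset} wildcard instead of the old
#                                    global itersetType key, so O, V and OnV run as
#                                    separate jobs
#   --hammingdist / --kmersindex     V-kmer distance matrix (.npz) and k-mer index
#   --target / --unexpected / --log  output and log paths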
......@@ -94,10 +95,10 @@ rule calcLikelihoods:
rule calcLikelihoods_Generative:
input:
counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
output:
likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods.json'
likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
params:
cpus = '1',
mem = '3G',
......@@ -107,22 +108,23 @@ rule calcLikelihoods_Generative:
k = lambda wildcards: wildcards.kmer,
e = lambda wildcards,input : extractTsvValue(input.baseError,0,True)
singularity:
'docker://phspo/ckmertools:latest'
'singularity_container/ckmertools_iterset.sif'
#'docker://phspo/ckmertools:latest'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
benchmark:
'benchmarks/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
shell:
'c_kmertools --p {input.counts} --m 1 --c {params.cpus} --o {input.observed} --baseerror {params.e} --k {params.k} --target {output.likelihoods} --log {log}'
rule estimateErrorRates:
input:
read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
output:
baseError = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
log:
'logs/'+config['input_folder']+'/kmers/{id}/estimateErrorRates.log'
'logs/{dataset}/kmers/{id}/estimateErrorRates.log'
params:
# cluster execution
cpus = '1',
......@@ -149,12 +151,12 @@ rule calcAverageCoverage:
####DEBUG RULES#####
rule calcKmerStats:
input:
expected = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
error = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/error_estimate.txt',
expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt',
coverage_estimate = determineKmerCoverageEstimateFile()
output:
stats = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/stats.tsv'
stats = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/stats.tsv'
params:
id = lambda wildcards: wildcards.id,
k = lambda wildcards: wildcards.kmer,
......@@ -166,6 +168,6 @@ rule calcKmerStats:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities.log'
'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
script:
'../scripts/calcKmerStats.py'
rule qualityControl:
input:
read1 = 'data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'],
read2 = 'data/input/'+ config['input_folder'] + '/{id}' + config['input_read_2_ending']
read1 = lambda wildcards: 'data/input/{dataset}/{id}'+config['datasets'][wildcards.dataset]['input_read_1_ending'],
read2 = lambda wildcards: 'data/input/{dataset}/{id}' + config['datasets'][wildcards.dataset]['input_read_2_ending']
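# the input endings stay dataset-specific (looked up via the wildcard), while the
# qc outputs below are renamed to internal _R1/_R2 endings so downstream rules no
# longer need to know each dataset's original file endings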
output:
read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
read2= 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending'],
jsonreport = 'data/auxiliary/' + config['input_folder'] + '/qc/{id}' + '.json',
htmlreport = report('data/auxiliary/' + config['input_folder'] + '/qc/{id}' + '.html',category='Preprocessing',caption='../report/qualityControl.rst')
read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
read2= 'data/auxiliary/{dataset}/{id}' + '.qc_internal_R2.fq',
jsonreport = 'data/auxiliary/{dataset}/qc/{id}' + '.json',
htmlreport = report('data/auxiliary/{dataset}/qc/{id}' + '.html',category='Preprocessing',caption='../report/qualityControl.rst')
params:
# cluster execution
cpus = '1',
......@@ -14,7 +14,7 @@ rule qualityControl:
gpus = '0',
walltime = '00:15:00'
log:
'logs/' + config['input_folder'] + '/{id}/qualityControl.log'
'logs/{dataset}/{id}/qualityControl.log'
conda:
'../envs/fastp.yaml'
shell:
......@@ -114,9 +114,9 @@ rule sort_bwa:
rule summarize:
input:
results = expand('data/auxiliary/'+config['input_folder']+'/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=inputIDs)
results = lambda wildcards: expand('data/auxiliary/{dataset}/{iterset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=dataset_inputIDs[wildcards.dataset], allow_missing=True)
output:
summary = report('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.rst")
summary = report('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.snk")
params:
# cluster execution
cpus = '1',
......@@ -130,10 +130,10 @@ rule summarize:
rule metaSummarize:
input:
summary = expand('data/output/'+config['input_folder']+'/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths),
summary = expand('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths, allow_missing=True),
groundTruth = 'data/input/' + config['ground_truth']
output:
meta = 'data/output/'+config['input_folder']+'/metaSummary.{mode}.tsv'
meta = 'data/output/{dataset}/{iterset}/metaSummary.{mode}.tsv'
params:
# cluster execution
cpus = '1',
......@@ -143,7 +143,7 @@ rule metaSummarize:
conda:
'../envs/biopythonworkbench.yaml'
log:
'logs/'+config['input_folder']+'/metaSummary.{mode}.log'
'logs/{dataset}/{iterset}/metaSummary.{mode}.log'
script:
'../scripts/metaSummarize.py'
......@@ -2,18 +2,28 @@ $schema: "http://json-schema.org/draft-04/schema#"
description: snakemake config file
definitions:
dataset:
type: "object"
description: a folder below data/input/ in which the reads that are to be classified are located
properties:
input_read_1_ending: {type: "string"} # the file ending for the first read (paired-end)
input_read_2_ending: {type: "string"} # the file ending for the second read (paired-end)
required: [input_read_1_ending, input_read_2_ending]
properties:
input_folder:
type: string
description: the input folder, in which the reads that are to be classified are located
input_read_1_ending:
type: string
description: the file ending for the first read (paired-end)
input_read_2_ending:
type: string
description: the file ending for the second read (paired-end)
required:
- input_folder
- input_read_1_ending
- input_read_2_ending
spa_repeats_file: {type: "string"}
spa_types: {type: "string"}
reference_genome: {type: "string"}
genome_file_identifier: {type: "string"}
datasets:
type: "object"
patternProperties:
"^.*$": { "$ref": "#/definitions/dataset" }
additionalProperties: { "$ref": "#/definitions/dataset" }
test:
type: "string"
required: [datasets, spa_repeats_file, spa_types, reference_genome, genome_file_identifier]
# Image Setup
singularity build ckmertools_iterset.sif ckmertools.def
\ No newline at end of file
Bootstrap: docker
From: phspo/ckmertools:latest
%environment
export PATH=/ckmertools/build:${PATH}
%post
git clone https://github.com/rogersce/cnpy.git
mkdir cnpy/build
cd cnpy/build && git checkout 4e8810b1a8637695171ed346ce68f6984e585ef4
cmake .. && make && make install
cd /ckmertools && git fetch && git checkout iterationset-tests
cd /ckmertools/build && cmake ../ && make
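The definition pins cnpy to a fixed commit and rebuilds the iterationset-tests branch of ckmertools inside the image. A hedged usage sketch; the target path matches the singularity: directives in the rules above:

# build the image where the Snakefile rules expect it
singularity build singularity_container/ckmertools_iterset.sif ckmertools.def
# sanity check: the %environment PATH should expose the freshly built binary
singularity exec singularity_container/ckmertools_iterset.sif which c_kmertools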