Commit 050a29c1 authored by Jan Hoeckesfeld

added iterset wildcard

parent cc4d7666
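
This commit threads a new {iterset} wildcard through the probabilistic pipeline: prediction, score, log, and benchmark paths gain an {iterset} segment, the scalar itersetType config key becomes the list-valued itersets, and two new driver rules (run_dataset, run_iterset) request one summary per dataset and per iteration set. A minimal plain-Python sketch of the fallback logic in the new use_itersets() helper (the dict here is a stand-in for Snakemake's config object):

    # Stand-in for Snakemake's global config object (illustration only).
    config = {'probabilistic_model': True, 'itersets': ['O', 'V', 'OnV']}

    def use_itersets():
        # Iterate over every configured set only when the probabilistic model
        # is enabled; otherwise (or if the list is empty) fall back to the O-set.
        if config['probabilistic_model'] and config['itersets']:
            return config['itersets']
        return ['O']

    print(use_itersets())  # ['O', 'V', 'OnV']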
@@ -14,31 +14,61 @@ kmer_lengths = config['kmers']
dataset_inputIDs = {}
#kmer_lengths = [24]
-def get_inputs(dataset):
-    run_dir = dataset
-    inputIDs, = glob_wildcards('data/input/'+dataset+'/{id}'+config["datasets"][dataset]['input_read_1_ending'])
-    dataset_inputIDs[dataset] = inputIDs
-    possible_params = {'generative_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
-        'probabilistic_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
-        # if above:
-        'plot_top3_fit': expand('data/output/'+run_dir+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
-        'distance_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+def get_general_inputs(wildcards):
+    run_dir = wildcards.dataset
+    inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
+    dataset_inputIDs[wildcards.dataset] = inputIDs
+    possible_params = {
        'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
        'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
        'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
        'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
        'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
-        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
-            expand('data/output/'+run_dir+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs)
    }
-    return [possible_params[k] for k in possible_params.keys() if config[k]]
+    return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
+
+def get_iterset_inputs(wildcards):
+    inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
+    possible_params = {
+        'generative_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset +'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
+        'distance_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset +'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+        'probabilistic_model': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset + '/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
+        # if above:
+        'plot_top3_fit': expand('data/output/' + wildcards.dataset +'/' + wildcards.iterset + '/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
+        'kmer_stats_analysis': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset +'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+    }
+    return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
+
+def use_itersets():
+    if config['probabilistic_model'] and config['itersets']:
+        return config['itersets']
+    return ['O']

rule all:
    input:
-        [get_inputs(dataset) for dataset in config["datasets"].keys()]
+        run_datasets = expand('data/output/{dataset}_summary.md', dataset=config['datasets'].keys())
+
+rule run_dataset:
+    input:
+        general = get_general_inputs,
+        summaries = expand('data/auxiliary/{dataset}/{iterset}_summary.md', iterset=use_itersets(), allow_missing=True)
+    output:
+        summary = 'data/output/{dataset}_summary.md'
+    # TODO create summary
+    shell:
+        'touch {output.summary}'
+
+rule run_iterset:
+    input:
+        get_iterset_inputs
+    output:
+        out = 'data/auxiliary/{dataset}/{iterset}_summary.md'
+    # TODO create summary
+    shell:
+        'touch {output.out}'

##### load rules #####
include: "rules/assembly.smk"
...
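
The driver chain above is all → run_dataset → run_iterset: run_dataset fans its summaries input out over use_itersets() while leaving {dataset} unresolved. A runnable illustration of that allow_missing=True behavior, using the same expand function Snakefiles use:

    from snakemake.io import expand

    # {iterset} is filled in from the list; {dataset} survives as a wildcard
    # because of allow_missing=True and is resolved later from the rule's output.
    print(expand('data/auxiliary/{dataset}/{iterset}_summary.md',
                 iterset=['O', 'V', 'OnV'], allow_missing=True))
    # ['data/auxiliary/{dataset}/O_summary.md',
    #  'data/auxiliary/{dataset}/V_summary.md',
    #  'data/auxiliary/{dataset}/OnV_summary.md']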
@@ -94,7 +94,7 @@ deviationCutoff : 2.5
skipMapping: False
plot_top3_fit : False
#choose the iterationset: either O, V, OuV (O union V) or OnV (O intersect V)
-itersetType : O
+itersets: [O,V,OnV]
###Blast Parameter
blast_word_size : 4
...
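
Configs written for the old scalar key are no longer read after this change; a hypothetical compatibility shim (not part of this commit) that accepts either form could look like this:

    # Hypothetical shim (not in this commit): accept the new list-valued
    # `itersets` key or the legacy scalar `itersetType`, defaulting to ['O'].
    def configured_itersets(config: dict) -> list:
        if config.get('itersets'):
            return config['itersets']
        return [config.get('itersetType', 'O')]

    assert configured_itersets({'itersets': ['O', 'V', 'OnV']}) == ['O', 'V', 'OnV']
    assert configured_itersets({'itersetType': 'V'}) == ['V']
    assert configured_itersets({}) == ['O']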
@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
rule calcPriorProbabilitiesCoverage:
    input:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json'
    output:
-        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
+        priorFilePath = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
        mem = '2G',
        walltime = '00:05:30'
    log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
rule calcProbabilitiesCoverage:
    input:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
-        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        prior = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
    output:
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
    params:
        dps = config['dps'],
        # cluster execution
@@ -117,7 +117,7 @@ rule calcProbabilitiesCoverage:
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
    script:
        '../scripts/calcSpaTypeProbabilities.py'
@@ -125,10 +125,10 @@ rule calcProbabilitiesCoverage:
rule createFitnessPlots:
    input:
        counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
    output:
-        report('data/output/{dataset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
+        report('data/output/{dataset}/{iterset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
    params:
        dps = config['dps'],
        # cluster execution
...
@@ -3,7 +3,7 @@ rule distance:
        readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
        spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
    output:
-        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
+        'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
    conda:
        '../envs/biopythonworkbench.yaml'
    params:
...
@@ -169,11 +169,11 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
    input:
        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
    output:
-        errors = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
-        deviations = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
+        errors = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
+        deviations = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
    params:
        gt = lambda wildcards : getGroundTruthType(wildcards.id),
        #cluster execution
@@ -184,7 +184,7 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
-        'logs/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
+        'logs/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
    script:
        '../scripts/createKmerErrorDistributionPlots.py'
@@ -193,16 +193,16 @@ rule likelihoodAnalysis_COVERAGE_BASED:
    input:
        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
    output:
-        likelihoodAnalysis = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        likelihoodAnalysis = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
    params:
        gt = lambda wildcards : getGroundTruthType(wildcards.id)
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
-        'logs/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        'logs/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
    script:
        '../scripts/likelihoodBreakdown.py'
@@ -337,15 +337,15 @@ rule createSpaTypeVennDiagram:
    input:
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        scores = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        scores = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
    output:
-        venngtt = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
-        venntopsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
-        vennrandomsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
+        venngtt = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
+        venntopsix = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
+        vennrandomsix = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
    params:
        gtt = lambda wildcards : getGroundTruthType(wildcards.id)
    log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
...
@@ -37,7 +37,7 @@ rule calcProbabilities:
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
    output:
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
    params:
        dps = config['dps'],
        cpus = '1',
@@ -47,7 +47,7 @@ rule calcProbabilities:
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
    script:
        '../scripts/calcSpaTypeProbabilities.py'
@@ -69,17 +69,17 @@ rule calcLikelihoods:
        V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
        V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
    output:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
-        unexpectedLikelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
    benchmark:
-        'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
+        'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
    params:
        e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
        deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
-        itersetType = config['itersetType'],
+        itersetType = lambda wildcards: wildcards.iterset,
        #cluster execution
        cpus = '8',
        mem = '15G',
...
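
With itersetType = lambda wildcards: wildcards.iterset, each calcLikelihoods job now receives its iteration set from its output path rather than from one global config value, so the O and V variants can run side by side. How the script consumes that parameter is repo-internal; a self-contained sketch of the set semantics named in the config comment (O, V, OuV = union, OnV = intersection), with a hypothetical helper:

    # Hypothetical helper (not in the repo): resolve which k-mer set a job
    # iterates over, following the config comment's naming.
    def select_iterset(o_kmers: set, v_kmers: set, iterset: str) -> set:
        if iterset == 'O':
            return o_kmers
        if iterset == 'V':
            return v_kmers
        if iterset == 'OuV':
            return o_kmers | v_kmers  # O union V
        if iterset == 'OnV':
            return o_kmers & v_kmers  # O intersect V
        raise ValueError('unknown iterset: ' + iterset)

    # Example: only the shared k-mer survives the intersection.
    print(select_iterset({'AAG', 'CTT'}, {'CTT', 'GGA'}, 'OnV'))  # {'CTT'}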
@@ -114,9 +114,9 @@ rule sort_bwa:
rule summarize:
    input:
-        results = lambda wildcards: expand('data/auxiliary/{dataset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=dataset_inputIDs[wildcards.dataset], allow_missing=True)
+        results = lambda wildcards: expand('data/auxiliary/{dataset}/{iterset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=dataset_inputIDs[wildcards.dataset], allow_missing=True)
    output:
-        summary = report('data/output/{dataset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.snk")
+        summary = report('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.snk")
    params:
        # cluster execution
        cpus = '1',
@@ -130,10 +130,10 @@ rule summarize:
rule metaSummarize:
    input:
-        summary = expand('data/output/{dataset}/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths, allow_missing=True),
+        summary = expand('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths, allow_missing=True),
        groundTruth = 'data/input/' + config['ground_truth']
    output:
-        meta = 'data/output/{dataset}/metaSummary.{mode}.tsv'
+        meta = 'data/output/{dataset}/{iterset}/metaSummary.{mode}.tsv'
    params:
        # cluster execution
        cpus = '1',
@@ -143,7 +143,7 @@ rule metaSummarize:
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
-        'logs/{dataset}/metaSummary.{mode}.log'
+        'logs/{dataset}/{iterset}/metaSummary.{mode}.log'
    script:
        '../scripts/metaSummarize.py'
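
One coupling to be aware of in rule summarize: its input lambda reads dataset_inputIDs[wildcards.dataset], which is only populated as a side effect of get_general_inputs running during DAG construction. A plain-Python sketch of that discovery step (the read-file suffix here is a hypothetical example value):

    from snakemake.io import glob_wildcards

    dataset_inputIDs = {}

    def discover_ids(dataset, read1_ending='_R1.fastq.gz'):
        # glob_wildcards matches existing files against the pattern and returns
        # the captured {id} values; empty when the directory has no matches.
        ids, = glob_wildcards('data/input/' + dataset + '/{id}' + read1_ending)
        dataset_inputIDs[dataset] = ids
        return ids

    print(discover_ids('some_dataset'))  # e.g. ['sampleA', 'sampleB']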