Commit 050a29c1 (project: Snakemake Ngs Spa Typing)
Authored Jan 20, 2021 by Jan Hoeckesfeld
Parent: cc4d7666
Commit message: added iterset wildcard
Changes: 7 files
Snakefile
@@ -14,31 +14,61 @@ kmer_lengths = config['kmers']
 dataset_inputIDs = {}
 #kmer_lengths = [24]

-def get_inputs(dataset):
-    run_dir = dataset
-    inputIDs, = glob_wildcards('data/input/'+dataset+'/{id}'+config["datasets"][dataset]['input_read_1_ending'])
-    dataset_inputIDs[dataset] = inputIDs
-    possible_params = {'generative_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
-        'probabilistic_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
-        # if above:
-        'plot_top3_fit': expand('data/output/'+run_dir+'/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
-        'distance_model': expand('data/output/'+run_dir+'/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+def get_general_inputs(wildcards):
+    run_dir = wildcards.dataset
+    inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
+    dataset_inputIDs[wildcards.dataset] = inputIDs
+    possible_params = {
         'assembly_model': expand('data/output/'+run_dir+'/{id}/exactMatches.tsv',id=inputIDs),
         'calc_strand_bias': expand('data/output/'+run_dir+'/{id}/strandbias.txt',id=inputIDs),
         'mapping_diff_analysis': expand('data/output/'+run_dir+'/methodAnalysis/{id}/mapping.comparison',id=inputIDs),
         'map_filtered_reads': expand('data/output/'+run_dir+'/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai',id=inputIDs),
         'verifyUniqueness': expand('data/output/kmers/{kmer}/uniquenessTest.tsv',kmer=kmer_lengths),
-        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs) +
-            expand('data/output/'+run_dir+'/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+        'kmer_stats_analysis': expand('data/auxiliary/'+run_dir+'/kmers/{kmer}/{id}/stats.tsv',kmer=kmer_lengths,id=inputIDs)
     }
-    return [possible_params[k] for k in possible_params.keys() if config[k]]
+    return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
+
+def get_iterset_inputs(wildcards):
+    inputIDs, = glob_wildcards('data/input/'+wildcards.dataset+'/{id}'+config["datasets"][wildcards.dataset]['input_read_1_ending'])
+    possible_params = {
+        'generative_model': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset + '/kmers/{kmer}/predictions.probabilistic_gen.tsv',kmer=kmer_lengths),
+        'distance_model': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset + '/kmers/{kmer}/predictions.euclidean.tsv',kmer=kmer_lengths),
+        'probabilistic_model': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset + '/kmers/{kmer}/predictions.probabilistic_cov.tsv',kmer=kmer_lengths),
+        # if above:
+        'plot_top3_fit': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset + '/kmers/{kmer}/{id}_top3fit.svg',kmer=kmer_lengths,id=inputIDs),
+        'kmer_stats_analysis': expand('data/output/' + wildcards.dataset + '/' + wildcards.iterset + '/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',kmer=kmer_lengths,id=inputIDs)
+    }
+    return [item for k in possible_params.keys() if config[k] for item in possible_params[k]]
+
+def use_itersets():
+    if config['probabilistic_model'] and config['itersets']:
+        return config['itersets']
+    return ['O']

 rule all:
     input:
-        [get_inputs(dataset) for dataset in config["datasets"].keys()]
+        run_datasets = expand('data/output/{dataset}_summary.md', dataset=config['datasets'].keys())
+
+rule run_dataset:
+    input:
+        general = get_general_inputs,
+        summarys = expand('data/auxiliary/{dataset}/{iterset}_summary.md', iterset=use_itersets(), allow_missing=True)
+    output:
+        summary = 'data/output/{dataset}_summary.md'
+    # TODO create summary
+    shell:
+        'touch {output.summary}'
+
+rule run_iterset:
+    input:
+        get_iterset_inputs
+    output:
+        out = 'data/auxiliary/{dataset}/{iterset}_summary.md'
+    # TODO create summary
+    shell:
+        'touch {output.out}'

 ##### load rules #####
 include: "rules/assembly.smk"
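Note: the new rules form a two-level fan-out. rule run_dataset requests one per-iterset summary while leaving {dataset} unresolved (allow_missing=True), and rule run_iterset then binds the {iterset} wildcard when each summary path is matched. A minimal, self-contained sketch of that pattern, with illustrative rule and file names rather than the pipeline's own:

# Hypothetical miniature of the run_dataset/run_iterset fan-out.
ITERSETS = ['O', 'V', 'OnV']

rule all:
    input:
        expand('out/{dataset}.done', dataset=['ds1'])

rule per_dataset:
    input:
        # iterset expands now; {dataset} stays a wildcard via allow_missing=True
        expand('work/{dataset}/{iterset}.done', iterset=ITERSETS, allow_missing=True)
    output:
        'out/{dataset}.done'
    shell:
        'touch {output}'

rule per_iterset:
    output:
        'work/{dataset}/{iterset}.done'
    shell:
        'touch {output}'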
config.example.yaml
@@ -94,7 +94,7 @@ deviationCutoff : 2.5
 skipMapping : False
 plot_top3_fit : False
 #choose the iterationset either O, V, OuV (O union V) or OnV (O intersect V)
-itersetType : O
+itersets : [O, V, OnV]

 ###Blast Parameter
 blast_word_size : 4
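Note: the config comment defines the iterset names via set operations. A sketch of those semantics in plain Python, assuming O and V are the pipeline's two k-mer sets (the literal values below are invented for illustration):

O = {'AAC', 'ACG', 'CGT'}   # hypothetical k-mer set O
V = {'ACG', 'CGT', 'GTT'}   # hypothetical k-mer set V
itersets = {
    'O':   O,
    'V':   V,
    'OuV': O | V,  # O union V
    'OnV': O & V,  # O intersect V
}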
rules/coverageBased.smk
@@ -83,9 +83,9 @@ rule estimateCoverageAlignment:
 rule calcPriorProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json'
     output:
-        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
+        priorFilePath = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
     params:
         k = lambda wildcards: wildcards.kmer,
         dps = config['dps'],
@@ -95,7 +95,7 @@ rule calcPriorProbabilitiesCoverage:
         mem = '2G',
         walltime = '00:05:30'
     log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
@@ -103,10 +103,10 @@ rule calcPriorProbabilitiesCoverage:
 rule calcProbabilitiesCoverage:
     input:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
-        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior_cov.txt'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        prior = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/prior_cov.txt'
     output:
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
     params:
         dps = config['dps'],
         # cluster execution
@@ -117,7 +117,7 @@ rule calcProbabilitiesCoverage:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
     script:
         '../scripts/calcSpaTypeProbabilities.py'
@@ -125,10 +125,10 @@ rule calcProbabilitiesCoverage:
 rule createFitnessPlots:
     input:
         counts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
         ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
     output:
-        report('data/output/{dataset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
+        report('data/output/{dataset}/{iterset}/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
     params:
         dps = config['dps'],
         # cluster execution
rules/euclidean.smk
@@ -3,7 +3,7 @@ rule distance:
         readProfile = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.profile.json',
         spaProfiles = 'data/auxiliary/kmers/{kmer}/spaSequences.kmerprofiles.json'
     output:
-        'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
+        'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.euclidean.tsv'
     conda:
         '../envs/biopythonworkbench.yaml'
     params:
rules/kmerApproach.smk
@@ -169,11 +169,11 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     input:
         expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
         kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        errors = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
-        deviations = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
+        errors = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg',
+        deviations = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/countDeviations.svg'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id),
         #cluster execution
@@ -184,7 +184,7 @@ rule createKmerDistributionGroundTruth_COVERAGE_BASED:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/{dataset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
+        'logs/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/kmerErrorDistributions.svg'
     script:
         '../scripts/createKmerErrorDistributionPlots.py'
@@ -193,16 +193,16 @@ rule likelihoodAnalysis_COVERAGE_BASED:
     input:
         expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         observedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
         kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt'
     output:
-        likelihoodAnalysis = 'data/output/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        likelihoodAnalysis = 'data/output/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     params:
         gt = lambda wildcards : getGroundTruthType(wildcards.id)
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/{dataset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
+        'logs/{dataset}/{iterset}/methodAnalysis/{kmer}/{id}/likelihoodAnalysis.txt'
     script:
         '../scripts/likelihoodBreakdown.py'
@@ -337,15 +337,15 @@ rule createSpaTypeVennDiagram:
     input:
         expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
         observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
-        scores = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
+        scores = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
     output:
-        venngtt = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
-        venntopsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
-        vennrandomsix = 'data/output/{dataset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
+        venngtt = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesGroundTruthVennDia.svg',
+        venntopsix = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesTopSixVennDia.svg',
+        vennrandomsix = 'data/output/{dataset}/{iterset}/kmers/{kmer}/{id}/spaTypesRandomSixVennDia.svg'
     params:
         gtt = lambda wildcards : getGroundTruthType(wildcards.id)
     log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/spaTypeVennDia.log'
     conda:
         '../envs/biopythonworkbench.yaml'
     script:
rules/probabilistic.smk
@@ -37,7 +37,7 @@ rule calcProbabilities:
         likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
         prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
     output:
-        probabilities = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
+        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
     params:
         dps = config['dps'],
         cpus = '1',
@@ -47,7 +47,7 @@ rule calcProbabilities:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
     script:
         '../scripts/calcSpaTypeProbabilities.py'
@@ -69,17 +69,17 @@ rule calcLikelihoods:
         V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
         V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
     output:
-        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods_cov.json',
-        unexpectedLikelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
+        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
+        unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
         #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
     log:
-        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
+        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
     benchmark:
-        'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
+        'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
     params:
         e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
         deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
-        itersetType = config['itersetType'],
+        itersetType = lambda wildcards: wildcards.iterset,
         #cluster execution
         cpus = '8',
         mem = '15G',
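Note: the itersetType change above replaces a fixed config lookup with a wildcard-dependent callable; Snakemake evaluates such params lambdas per job, after the wildcards have been matched, so one rule now serves every iterset directory. A minimal sketch of the idiom (rule name and paths are illustrative):

rule echo_iterset:
    output:
        'tmp/{iterset}/marker.txt'
    params:
        # resolved per job from the matched {iterset} wildcard
        itersetType = lambda wildcards: wildcards.iterset
    shell:
        # e.g. snakemake -c1 tmp/O/marker.txt writes "O"
        'echo {params.itersetType} > {output}'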
rules/shared.smk
@@ -114,9 +114,9 @@ rule sort_bwa:
 rule summarize:
     input:
-        results = lambda wildcards: expand('data/auxiliary/{dataset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=dataset_inputIDs[wildcards.dataset], allow_missing=True)
+        results = lambda wildcards: expand('data/auxiliary/{dataset}/{iterset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',id=dataset_inputIDs[wildcards.dataset], allow_missing=True)
     output:
-        summary = report('data/output/{dataset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.snk")
+        summary = report('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{mode}.tsv',category="Spa-Type Predictions",caption="../report/prediction.snk")
     params:
         # cluster execution
         cpus = '1',
@@ -130,10 +130,10 @@ rule summarize:
 rule metaSummarize:
     input:
-        summary = expand('data/output/{dataset}/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths, allow_missing=True),
+        summary = expand('data/output/{dataset}/{iterset}/kmers/{kmer}/predictions.{{mode}}.tsv',kmer=kmer_lengths, allow_missing=True),
         groundTruth = 'data/input/' + config['ground_truth']
     output:
-        meta = 'data/output/{dataset}/metaSummary.{mode}.tsv'
+        meta = 'data/output/{dataset}/{iterset}/metaSummary.{mode}.tsv'
     params:
         # cluster execution
         cpus = '1',
@@ -143,7 +143,7 @@ rule metaSummarize:
     conda:
         '../envs/biopythonworkbench.yaml'
     log:
-        'logs/{dataset}/metaSummary.{mode}.log'
+        'logs/{dataset}/{iterset}/metaSummary.{mode}.log'
     script:
         '../scripts/metaSummarize.py'
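Note: rule summarize leans on two escaping mechanisms in expand(): doubled braces ({{kmer}}, {{mode}}) survive expansion as single-brace wildcards, and allow_missing=True leaves unlisted variables ({dataset}, {iterset}) unexpanded. A sketch of the resulting paths (the sample IDs are invented):

from snakemake.io import expand

paths = expand(
    'data/auxiliary/{dataset}/{iterset}/kmers/{{kmer}}/{id}/scores.{{mode}}.tsv',
    id=['s1', 's2'], allow_missing=True)
# paths == ['data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/s1/scores.{mode}.tsv',
#           'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/s2/scores.{mode}.tsv']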