probabilistic.smk 8.22 KB
Newer Older
Philipp Spohr's avatar
Philipp Spohr committed
1
2
rule compareExpectedKmerProfileToTrueProfile:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
3
4
        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
Philipp Spohr's avatar
Philipp Spohr committed
5
6
        groundTruthFile = 'data/input/' + config['ground_truth']
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
7
        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
Philipp Spohr's avatar
Philipp Spohr committed
8
9
10
11
12
13
14
15
16
    params:
        inputFileID = lambda wildcards: wildcards.id
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts_expected.py'

rule calcPriorProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
17
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
18
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
19
        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
20
21
22
23
24
25
26
27
28
29
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
        cpus = '1',
        mem = '1G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
30
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
Philipp Spohr's avatar
Philipp Spohr committed
31
32
33
34
35
36
    script:
        '../scripts/calcPriorProbabilities.py'


rule calcProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
37
38
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
39
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
40
        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
41
42
43
44
45
46
47
48
49
    params:
        dps = config['dps'],
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
50
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
51
52
53
54
55
56
57
58
59
60
61
62
    script:
        '../scripts/calcSpaTypeProbabilities.py'




def extractTsvValue(filePath,line,nolabels=False):
    with open(filePath,'r') as infile:
        lines = infile.read().splitlines();
        return lines[line].split('\t')[0] if nolabels else lines[line].split('\t')[1]


Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
63
ruleorder: calcLikelihoods_master > calcLikelihoods
Philipp Spohr's avatar
Philipp Spohr committed
64
65
rule calcLikelihoods:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
66
67
68
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
69
70
71
        kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
        V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
        V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
Philipp Spohr's avatar
Philipp Spohr committed
72
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
73
74
        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
Philipp Spohr's avatar
Philipp Spohr committed
75
76
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
77
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
Philipp Spohr's avatar
Philipp Spohr committed
78
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
79
        'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
80
    params:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
81
82
        e = lambda wildcards,input : extractTsvValue(input.kmerError,0),
        deviationCutoff = lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config)),
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
83
        itersetType = lambda wildcards: wildcards.iterset,
Philipp Spohr's avatar
Philipp Spohr committed
84
        #cluster exectuion
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
85
        cpus = '4',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
86
        mem = '10G',
Philipp Spohr's avatar
Philipp Spohr committed
87
        gpus = '0',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
88
        walltime = lambda wildcards, attempt: '01:00:00'
Philipp Spohr's avatar
Philipp Spohr committed
89
    singularity:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
90
        'singularity_container/ckmertools_iterset.sif'
91
        #'docker://phspo/ckmertools:iterationset-tests'
Philipp Spohr's avatar
Philipp Spohr committed
92
    shell:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
93
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log} --itersetType {params.itersetType} --hammingdist {input.V_kmers_distances} --kmersindex {input.V_kmers}'
Philipp Spohr's avatar
Philipp Spohr committed
94

Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
rule calcLikelihoods_master:
    input:
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
        kmerCoverageEstimate = determineKmerCoverageEstimateFile()
    output:
        likelihoods = 'data/auxiliary/{dataset}/master/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/{dataset}/master/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    log:
        'logs/{dataset}/master/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
    benchmark:
        'benchmarks/{dataset}/master/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
    params:
        e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
        deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
        #cluster exectuion
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
113
        cpus = '4',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
114
115
116
117
118
119
120
        mem = '4G',
        gpus = '0',
        walltime = '01:30:00'
    singularity:
        'docker://phspo/ckmertools:latest'
    shell:
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log}'
Philipp Spohr's avatar
Philipp Spohr committed
121
122
123
124

rule calcLikelihoods_Generative:
    input:
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
125
126
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
127
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
128
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
129
130
131
132
133
134
135
136
137
    params:
        cpus = '1',
        mem = '3G',
        gpus = '0',
        walltime = '24:00:00',
        #cluster execution
        k = lambda wildcards: wildcards.kmer,
        e = lambda wildcards,input : extractTsvValue(input.baseError,0,True)
    singularity:
138
139
        'singularity_container/ckmertools_iterset.sif'
        #'docker://phspo/ckmertools:latest'
Philipp Spohr's avatar
Philipp Spohr committed
140
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
141
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
Philipp Spohr's avatar
Philipp Spohr committed
142
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
143
        'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
144
145
146
147
148
    shell:
        'c_kmertools --p {input.counts} --m 1 --c {params.cpus} --o {input.observed} --baseerror {params.e} --k {params.k} --target {output.likelihoods} --log {log}'

rule estimateErrorRates:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
149
150
        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
Philipp Spohr's avatar
Philipp Spohr committed
151
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
152
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
153
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
154
        'logs/{dataset}/kmers/{id}/estimateErrorRates.log'
Philipp Spohr's avatar
Philipp Spohr committed
155
156
157
    params:
        # cluster execution
        cpus = '1',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
158
        mem = '1G',
Philipp Spohr's avatar
Philipp Spohr committed
159
        gpus = '0',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
160
        walltime = '00:10:00'
Philipp Spohr's avatar
Philipp Spohr committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateErrorRates.py'

'''
rule calcAverageCoverage:
    input:
        alignment = 'data/auxiliary/{id}/alignment.sorted.bam'
    output:
        'data/auxiliary/{id}/averageCoverage.depth'
    conda:
        '../envs/main.yaml'
    shell:
        'samtools mpileup -B -d 10000 -q0 -Q0 -r maskref {input.alignment}  > {output}'
'''

####DEBUG RULES#####
rule calcKmerStats:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
181
182
183
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt',
Philipp Spohr's avatar
Philipp Spohr committed
184
185
        coverage_estimate = determineKmerCoverageEstimateFile()
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
186
        stats = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/stats.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
187
188
    params:
        id = lambda wildcards: wildcards.id,
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
189
190
191
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
192
        mem = '2G',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
193
194
        gpus = '0',
        walltime = '00:30:00'
Philipp Spohr's avatar
Philipp Spohr committed
195
196
197
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
198
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
199
200
    script:
        '../scripts/calcKmerStats.py'