probabilistic.smk 8.08 KB
Newer Older
Philipp Spohr's avatar
Philipp Spohr committed
1
2
rule compareExpectedKmerProfileToTrueProfile:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
3
4
        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
Philipp Spohr's avatar
Philipp Spohr committed
5
6
        groundTruthFile = 'data/input/' + config['ground_truth']
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
7
        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
Philipp Spohr's avatar
Philipp Spohr committed
8
9
10
11
12
13
14
15
16
    params:
        inputFileID = lambda wildcards: wildcards.id
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts_expected.py'

rule calcPriorProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
17
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
18
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
19
        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
20
21
22
23
24
25
26
27
28
29
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
        cpus = '1',
        mem = '1G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
30
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
Philipp Spohr's avatar
Philipp Spohr committed
31
32
33
34
35
36
    script:
        '../scripts/calcPriorProbabilities.py'


rule calcProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
37
38
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
39
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
40
        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
41
42
43
44
45
46
47
48
49
    params:
        dps = config['dps'],
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
50
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
51
52
53
54
55
56
57
58
59
60
61
62
63
64
    script:
        '../scripts/calcSpaTypeProbabilities.py'




def extractTsvValue(filePath,line,nolabels=False):
    with open(filePath,'r') as infile:
        lines = infile.read().splitlines();
        return lines[line].split('\t')[0] if nolabels else lines[line].split('\t')[1]


rule calcLikelihoods:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
65
66
67
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
68
69
70
        kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
        V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
        V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
Philipp Spohr's avatar
Philipp Spohr committed
71
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
72
73
        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
Philipp Spohr's avatar
Philipp Spohr committed
74
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
75
    priority: 0
Philipp Spohr's avatar
Philipp Spohr committed
76
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
77
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
Philipp Spohr's avatar
Philipp Spohr committed
78
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
79
        'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
80
    params:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
81
82
        e = lambda wildcards,input : extractTsvValue(input.kmerError,0),
        deviationCutoff = lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config)),
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
83
        itersetType = lambda wildcards: wildcards.iterset,
Philipp Spohr's avatar
Philipp Spohr committed
84
        #cluster exectuion
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
85
        cpus = '4',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
86
        mem = '10G',
Philipp Spohr's avatar
Philipp Spohr committed
87
        gpus = '0',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
88
        walltime = lambda wildcards, attempt: '01:00:00'
Philipp Spohr's avatar
Philipp Spohr committed
89
    singularity:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
90
        'singularity_container/ckmertools_iterset.sif'
91
        #'docker://phspo/ckmertools:iterationset-tests'
Philipp Spohr's avatar
Philipp Spohr committed
92
    shell:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
93
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log} --itersetType {params.itersetType} --hammingdist {input.V_kmers_distances} --kmersindex {input.V_kmers}'
Philipp Spohr's avatar
Philipp Spohr committed
94

Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
rule calcLikelihoods_master:
    input:
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
        kmerCoverageEstimate = determineKmerCoverageEstimateFile()
    output:
        likelihoods = 'data/auxiliary/{dataset}/master/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/{dataset}/master/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    priority: 1
    log:
        'logs/{dataset}/master/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
    benchmark:
        'benchmarks/{dataset}/master/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
    params:
        e = (lambda wildcards,input : extractTsvValue(input.kmerError,0)),
        deviationCutoff = (lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config))),
        #cluster exectuion
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '01:30:00'
    singularity:
        'docker://phspo/ckmertools:latest'
    shell:
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log}'
Philipp Spohr's avatar
Philipp Spohr committed
122
123
124
125

rule calcLikelihoods_Generative:
    input:
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
126
127
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
128
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
129
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
130
131
132
133
134
135
136
137
138
    params:
        cpus = '1',
        mem = '3G',
        gpus = '0',
        walltime = '24:00:00',
        #cluster execution
        k = lambda wildcards: wildcards.kmer,
        e = lambda wildcards,input : extractTsvValue(input.baseError,0,True)
    singularity:
139
140
        'singularity_container/ckmertools_iterset.sif'
        #'docker://phspo/ckmertools:latest'
Philipp Spohr's avatar
Philipp Spohr committed
141
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
142
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
Philipp Spohr's avatar
Philipp Spohr committed
143
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
144
        'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
145
146
147
148
149
    shell:
        'c_kmertools --p {input.counts} --m 1 --c {params.cpus} --o {input.observed} --baseerror {params.e} --k {params.k} --target {output.likelihoods} --log {log}'

rule estimateErrorRates:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
150
151
        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
Philipp Spohr's avatar
Philipp Spohr committed
152
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
153
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
154
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
155
        'logs/{dataset}/kmers/{id}/estimateErrorRates.log'
Philipp Spohr's avatar
Philipp Spohr committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
    params:
        # cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateErrorRates.py'

'''
rule calcAverageCoverage:
    input:
        alignment = 'data/auxiliary/{id}/alignment.sorted.bam'
    output:
        'data/auxiliary/{id}/averageCoverage.depth'
    conda:
        '../envs/main.yaml'
    shell:
        'samtools mpileup -B -d 10000 -q0 -Q0 -r maskref {input.alignment}  > {output}'
'''

####DEBUG RULES#####
rule calcKmerStats:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
182
183
184
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt',
Philipp Spohr's avatar
Philipp Spohr committed
185
186
        coverage_estimate = determineKmerCoverageEstimateFile()
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
187
        stats = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/stats.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
188
189
190
191
192
193
    params:
        id = lambda wildcards: wildcards.id,
        k = lambda wildcards: wildcards.kmer
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
194
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
195
196
    script:
        '../scripts/calcKmerStats.py'