probabilistic.smk 6.57 KB
Newer Older
Philipp Spohr's avatar
Philipp Spohr committed
1
2
rule compareExpectedKmerProfileToTrueProfile:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
3
4
        trueCounts = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/correctCounts.json',
        expectedCounts = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
Philipp Spohr's avatar
Philipp Spohr committed
5
6
        groundTruthFile = 'data/input/' + config['ground_truth']
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
7
        differences = 'data/output/{dataset}/methodAnalysis/{id}/{kmer}/differences_expected.txt'
Philipp Spohr's avatar
Philipp Spohr committed
8
9
10
11
12
13
14
15
16
    params:
        inputFileID = lambda wildcards: wildcards.id
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/compareKmerCounts_expected.py'

rule calcPriorProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
17
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
18
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
19
        priorFilePath = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
20
21
22
23
24
25
26
27
28
29
    params:
        k = lambda wildcards: wildcards.kmer,
        dps = config['dps'],
        cpus = '1',
        mem = '1G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
30
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/calcPrior.log'
Philipp Spohr's avatar
Philipp Spohr committed
31
32
33
34
35
36
    script:
        '../scripts/calcPriorProbabilities.py'


rule calcProbabilities:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
37
38
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json',
        prior = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/prior.txt'
Philipp Spohr's avatar
Philipp Spohr committed
39
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
40
        probabilities = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/scores.probabilistic_gen.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
41
42
43
44
45
46
47
48
49
    params:
        dps = config['dps'],
        cpus = '1',
        mem = '4G',
        gpus = '0',
        walltime = '00:05:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
50
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
51
52
53
54
55
56
57
58
59
60
61
62
63
64
    script:
        '../scripts/calcSpaTypeProbabilities.py'




def extractTsvValue(filePath,line,nolabels=False):
    with open(filePath,'r') as infile:
        lines = infile.read().splitlines();
        return lines[line].split('\t')[0] if nolabels else lines[line].split('\t')[1]


rule calcLikelihoods:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
65
66
67
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        kmerError = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/kmer_error.txt',
68
69
70
        kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
        V_kmers_distances = 'data/auxiliary/kmers/{kmer}/V_kmers.distances.npz',
        V_kmers = 'data/auxiliary/kmers/{kmer}/V_kmers.json'
Philipp Spohr's avatar
Philipp Spohr committed
71
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
72
73
        likelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/likelihoods_cov.json',
        unexpectedLikelihoods = 'data/auxiliary/{dataset}/{iterset}/kmers/{kmer}/{id}/unexpected_likelihoods_cov.json'
Philipp Spohr's avatar
Philipp Spohr committed
74
75
        #diffs = 'data/auxiliary/kmers/{kmer}/{id}/kmer_diff.tsv'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
76
        'logs/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/likelihoods_cov.log'
Philipp Spohr's avatar
Philipp Spohr committed
77
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
78
        'benchmarks/{dataset}/{iterset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsCoverageBasedModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
79
    params:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
80
81
        e = lambda wildcards,input : extractTsvValue(input.kmerError,0),
        deviationCutoff = lambda wildcards,input : round(config['deviationCutoff']*extractCoverageEstimateFile(input.kmerCoverageEstimate,config)),
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
82
        itersetType = lambda wildcards: wildcards.iterset,
Philipp Spohr's avatar
Philipp Spohr committed
83
        #cluster exectuion
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
84
        cpus = '4',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
85
        mem = '10G',
Philipp Spohr's avatar
Philipp Spohr committed
86
        gpus = '0',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
87
        walltime = lambda wildcards, attempt: '01:00:00'
Philipp Spohr's avatar
Philipp Spohr committed
88
    singularity:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
89
        'singularity_container/ckmertools_iterset.sif'
90
        #'docker://phspo/ckmertools:iterationset-tests'
Philipp Spohr's avatar
Philipp Spohr committed
91
    shell:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
92
        'c_kmertools --e {input.expected} --c {params.cpus} --m 0 --o {input.observed} --kmererror {params.e} --d {params.deviationCutoff} --target {output.likelihoods} --unexpected {output.unexpectedLikelihoods} --log {log} --itersetType {params.itersetType} --hammingdist {input.V_kmers_distances} --kmersindex {input.V_kmers}'
Philipp Spohr's avatar
Philipp Spohr committed
93
94
95
96
97


rule calcLikelihoods_Generative:
    input:
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json',
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
98
99
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
100
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
101
        likelihoods = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/likelihoods.json'
Philipp Spohr's avatar
Philipp Spohr committed
102
103
104
105
106
107
108
109
110
    params:
        cpus = '1',
        mem = '3G',
        gpus = '0',
        walltime = '24:00:00',
        #cluster execution
        k = lambda wildcards: wildcards.kmer,
        e = lambda wildcards,input : extractTsvValue(input.baseError,0,True)
    singularity:
111
112
        'singularity_container/ckmertools_iterset.sif'
        #'docker://phspo/ckmertools:latest'
Philipp Spohr's avatar
Philipp Spohr committed
113
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
114
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/likelihoods.log'
Philipp Spohr's avatar
Philipp Spohr committed
115
    benchmark:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
116
        'benchmarks/{dataset}/probabilistic/kmers/{kmer}/{id}/calcLikelihoodsGenerativeModel.txt'
Philipp Spohr's avatar
Philipp Spohr committed
117
118
119
120
121
    shell:
        'c_kmertools --p {input.counts} --m 1 --c {params.cpus} --o {input.observed} --baseerror {params.e} --k {params.k} --target {output.likelihoods} --log {log}'

rule estimateErrorRates:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
122
123
        read1 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R1.fq',
        read2 = 'data/auxiliary/{dataset}/{id}'+'.qc_internal_R2.fq'
Philipp Spohr's avatar
Philipp Spohr committed
124
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
125
        baseError = 'data/auxiliary/{dataset}/{id}/base_error_estimate.txt'
Philipp Spohr's avatar
Philipp Spohr committed
126
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
127
        'logs/{dataset}/kmers/{id}/estimateErrorRates.log'
Philipp Spohr's avatar
Philipp Spohr committed
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
    params:
        # cluster execution
        cpus = '1',
        mem = '32G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateErrorRates.py'

'''
rule calcAverageCoverage:
    input:
        alignment = 'data/auxiliary/{id}/alignment.sorted.bam'
    output:
        'data/auxiliary/{id}/averageCoverage.depth'
    conda:
        '../envs/main.yaml'
    shell:
        'samtools mpileup -B -d 10000 -q0 -Q0 -r maskref {input.alignment}  > {output}'
'''

####DEBUG RULES#####
rule calcKmerStats:
    input:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
154
155
156
        expected = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/expected_counts.json',
        observed = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/alignment.counts.json',
        error = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/error_estimate.txt',
Philipp Spohr's avatar
Philipp Spohr committed
157
158
        coverage_estimate = determineKmerCoverageEstimateFile()
    output:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
159
        stats = 'data/auxiliary/{dataset}/kmers/{kmer}/{id}/stats.tsv'
Philipp Spohr's avatar
Philipp Spohr committed
160
161
162
163
164
165
    params:
        id = lambda wildcards: wildcards.id,
        k = lambda wildcards: wildcards.kmer
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
Jan Hoeckesfeld's avatar
Jan Hoeckesfeld committed
166
        'logs/{dataset}/probabilistic/kmers/{kmer}/{id}/probabilities.log'
Philipp Spohr's avatar
Philipp Spohr committed
167
168
    script:
        '../scripts/calcKmerStats.py'