Commit c7246bda authored by Philipp Spohr's avatar Philipp Spohr
Browse files

Added software

parent 7ffe33b6
Copyright 2020 Philipp Spohr
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
Snakemake workflow for spa-typing of Next-Generation Sequencing data.
Execution of the workflow requires Snakemake, Conda and Singularity.
# Execution
Input reads (.fastq) should be moved to (or symlinked into) a folder in data/input.
Add a reference genome folder to the data/input folder containing a .fasta file as well as a gene table that contains an entry for protein A.
Add a spa repeat definitions file in .fasta format as well as a spa types definition file to the data/input folder. (Both files can be downloaded from https://spaserver.ridom.de/)
Create a copy of the config.example.yaml
​ cp config.example.yaml config.yaml
and modify the following entries:
spa_repeats_file (change to the name of your spa repeat definition file)
spa_types (change to the name of your spa type definition file)
input_folder (change to the name of your input reads folder)
input_read_1_ending, input_read_2_ending (change to the file endings that mark your samples)
Execute with
​ snakemake --use-conda --use-singularity --cores 8
to run with 8 cores
# Snakefile entry point: load shared helpers, validate the configuration and
# discover the sample ids that drive the whole workflow.
include: "scripts/shared.py"

from snakemake.utils import validate

# Validate the configuration file against its JSON schema before any rule runs
configfile: "config.yaml"
validate(config, "schemas/config.schema.yaml")

# Derive the sample ids from the read-1 file names found in the configured
# input folder; every '{id}' wildcard downstream is filled from this list.
inputIDs, = glob_wildcards('data/input/'+config['input_folder']+'/{id}'+config['input_read_1_ending'])

# k-mer lengths to evaluate (taken from the config; one output set per k)
kmer_lengths = config['kmers']
#kmer_lengths = [24]
#Helper function that assembles required input files dependent on configuration settings
def get_input():
    """Build the target list for ``rule all``.

    Each boolean flag in the validated ``config`` enables one group of
    outputs; ``expand`` instantiates the corresponding path pattern for
    every k-mer length in ``kmer_lengths`` and/or every sample id in
    ``inputIDs``.

    Returns:
        list[str]: a flat list of all requested output file paths.

    Note: ``expand`` already returns a list, so the results are merged
    with ``extend`` — ``append`` would nest one list per flag inside the
    returned list instead of producing a flat list of paths.
    """
    out = 'data/output/' + config['input_folder']
    aux = 'data/auxiliary/' + config['input_folder']
    input_list = []
    if config['generative_model']:
        input_list.extend(expand(out + '/kmers/{kmer}/predictions.probabilistic_gen.tsv', kmer=kmer_lengths))
    if config['probabilistic_model']:
        input_list.extend(expand(out + '/kmers/{kmer}/predictions.probabilistic_cov.tsv', kmer=kmer_lengths))
    if config['plot_top3_fit']:
        input_list.extend(expand(out + '/kmers/{kmer}/{id}_top3fit.svg', kmer=kmer_lengths, id=inputIDs))
    if config['distance_model']:
        input_list.extend(expand(out + '/kmers/{kmer}/predictions.euclidean.tsv', kmer=kmer_lengths))
    if config['assembly_model']:
        input_list.extend(expand(out + '/{id}/exactMatches.tsv', id=inputIDs))
    if config['calc_strand_bias']:
        input_list.extend(expand(out + '/{id}/strandbias.txt', id=inputIDs))
    if config['mapping_diff_analysis']:
        input_list.extend(expand(out + '/methodAnalysis/{id}/mapping.comparison', id=inputIDs))
    if config['map_filtered_reads']:
        input_list.extend(expand(out + '/methodAnalysis/{id}/alignmentToGroundTruthType.sorted.bam.bai', id=inputIDs))
    if config['verifyUniqueness']:
        # uniqueness test is dataset-independent, hence no input_folder prefix
        input_list.extend(expand('data/output/kmers/{kmer}/uniquenessTest.tsv', kmer=kmer_lengths))
    if config['kmer_stats_analysis']:
        input_list.extend(expand(aux + '/kmers/{kmer}/{id}/stats.tsv', kmer=kmer_lengths, id=inputIDs))
    return input_list
# Default target: request every output group that is enabled in the config.
rule all:
    input:
        get_input()

##### load rules #####
include: "rules/assembly.snk"
include: "rules/shared.snk"
include: "rules/kmerApproach.snk"
include: "rules/coverageBased.snk"
include: "rules/probabilistic.snk"
include: "rules/euclidean.snk"
#!/bin/bash
#Snakemake per-job jobscript template: Snakemake substitutes {properties} and
#{exec_job} below when submitting each job to the scheduler, so this file is
#not runnable on its own.
#Load Conda and Snakemake and Singularity for cluster execution if a module system exists on the HPC used for computation
#Depending on your HPC system's architecture you might have to adjust this file
module load Singularity/3.5.2
module load Miniconda/3_snakemake
module load Snakemake/5.10.0
#properties = {properties}
{exec_job}
#!/bin/bash
#Submit the whole workflow to a PBS-style (qsub) cluster.
#
#Usage: clusterExecution.sh -p projectID -j maxNrOfConcurrentJobs
#  -p  accounting project id passed to qsub via -A
#  -j  maximum number of jobs snakemake may have in flight at once
#
#Depending on your HPC system's architecture you might have to adjust the
#module names below.
module load Singularity/3.5.2
module load Miniconda/3_snakemake
module load Snakemake/5.10.0

usage() {
	echo "Usage: $0 -p projectID -j maxNrOfConcurrentJobs" >&2
	exit 1
}

while getopts "p:j:" opt
do
	case $opt in
		p) projectID="$OPTARG" ;;
		j) maxNrOfConcurrentJobs="$OPTARG" ;;
		*) usage ;;  # reject unknown options instead of silently ignoring them
	esac
done

#Both options are mandatory
if [ -z "$projectID" ] || [ -z "$maxNrOfConcurrentJobs" ]
then
	usage
fi

#Per-job scheduler stdout/stderr files land here
mkdir -p clusterLogs

type snakemake >/dev/null 2>&1 || { echo >&2 "I require snakemake but it's not installed or added to your path. Aborting..."; exit 1; }

#Resource requests ({params.cpus} etc.) are filled in by snakemake per rule.
snakemake --jobs "$maxNrOfConcurrentJobs" --use-singularity --use-conda --reason --jobscript cluster/jobscript.sh --cluster "qsub -e clusterLogs/{rule}.{wildcards}.{jobid}.errors -o clusterLogs/{rule}.{wildcards}.{jobid}.output -A ${projectID} -l select=1:ncpus={params.cpus}:ngpus={params.gpus}:mem={params.mem} -l walltime={params.walltime}"
###Static Files
#Repeat definitions (as downloaded from ridom spa server), relative to data/input
spa_repeats_file : sparepeats.fasta
#Spa-type definitions (as downloaded from ridom spa server), relative to data/input
spa_types : spatypes.txt
#If true only a subset of spa-types is used (this can be useful if you only want to decide between certain types)
useSubset : False
#A list of comma separated spa-types that should be considered for the analysis
#We included one file containing the 100 most common spa-types in the repository as a demo
subsetFile: top_01_04_2020.csv
###Prediction Modes
#Poisson-Based Model: Recommended
probabilistic_model : True
#Generative Model
generative_model : False
#Euclidean Distance Model
distance_model : False
#Assembly based mode, experimental and not recommended
assembly_model : False
###Reference Genome
reference_genome : NCTC8325
genome_file_identifier : genome.fa #NCTC8325
#genome_file_identifier : genome.fna
protein_table_identifier : protein.table
reference_guided_assembly : False
reference_genome_table_index_organism_id : 6 #NCTC8325
reference_genome_table_index_start_pos : 7 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_end_pos : 8 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_strand : 9 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_protein_id : 13 #H-EMRSA-15 #NCTC8325
protein_a_identifier : protein A #NCTC8325
l_lactate_permease_identifier : L-lactate permease #NCTC8325
arlp_identifier : accessory regulator-like protein #NCTC8325
input_folder : test
input_read_1_ending : _1.fq
input_read_2_ending : _2.fq
### Method Analysis
# Calculate the strand bias based on alignments
calc_strand_bias : False
### Ground Truth Analysis
#A tab-separated file where column 1 contains the sample name and column 2 the correct spa-type
ground_truth : groundTruth.example.tsv
#Map the filtered (assumed X region) reads against the ground truth reference
map_filtered_reads : False
#Only works for simulated reads with wgsim: Analyzes how many reads got correctly mapped to the X region
mapping_diff_analysis: False
### k-mer Approach
kmers: [38]
#Test whether k-mers are unique to the Protein A region for the given k. This can be used as an indicator when the skipMapping option could be viable
verifyUniqueness : False
kmer_stats_analysis : False
#The precision that is used for log-space additions and other high precision requiring math operations
dps : 100
### Coverage-Based Method
#Determine which method is used for k-mer coverage estimation
#This affects only the COV method
#alignment -> k-mer coverage is estimated based on read coverage found in an alignment to a reference genome
#countPeak -> k-mer coverage is based on a k-mer frequency histogram where a poisson-distribution is assumed and the first peak (after cutting off error k-mers) is believed to be the single k-mer coverage
#countMean -> k-mer coverage is based on a k-mer frequency histogram where the average k-mer coverage is estimated by the mean of k-mer frequencies (again, cutting off error k-mers)
kmerCoverageEstimationMethod : countPoisson
#when calculating probabilities based on COV method, skip a type as soon as the deviation between expected and observed exceeds (deviationCutoff*kmerCoverageEstimate) for any k-mer
deviationCutoff : 2.5
#likelihoods exceeding this value are not taken into account when calculating a prior
#likelihoodCutoff : -50000
#If the k chosen is sufficiently large, k-mers can be unique to the protein A region. In this case mapping can be skipped and all k-mers detected in the reads can be assumed to have originated from the protein A region. The skipMapping option skips the entire mapping process and generates k-mer profiles for the input data directly from the reads.
#This applies to the methods: COV and EUC, the method GEN is unaffected by this
skipMapping: False
plot_top3_fit : False
###Blast Parameter
blast_word_size : 4
blast_word_size_hypProtA : 11
blast_e_value : 1.0e-40
blast_e_value_hypProtA : 1.0e-50
sample1 t032
sample2 t032
sample3 t1337
> Artificial Protein A
TTGAAAAAGAAAAACATTTATTCAATTCGTAAACTAGGTGTAGGTATTGCATCTGTAACTTTAGGTACATTACTTATATCTGGTGGCGTAACACCTGCTGCAAATGCTGCGCAACACGATGAAGCTCAACAAAATGCTTTTTATCAAGTCTTAAATATGCCTAACTTAAATGCTGATCAACGCAATGGTTTTATCCAAAGCCTTAAAGATGATCCAAGCCAAAGTGCTAACGTTTTAGGTGAAGCTCAAAAACTTAATGACTCTCAAGCTCCAAAAGCTGATGCGCAACAAAATAACTTCAACAAAGATCAACAAAGCGCCTTCTATGAAATCTTGAACATGCCTAACTTAAACGAAGCGCAACGTAACGGCTTCATTCAAAGTCTTAAAGACGACCCAAGCCAAAGCACTAACGTTTTAGGTGAAGCTAAAAAATTAAACGAATCTCAAGCACCGAAAGCTGATAACAATTTCAACAAAGAACAACAAAATGCTTTCTATGAAATCTTGAATATGCCTAACTTAAACGAAGAACAACGCAATGGTTTCATCCAAAGCTTAAAAGATGACCCAAGCCAAAGTGCTAACCTATTGTCAGAAGCTAAAAAGTTAAATGAATCTCAAGCACCGAAAGCGGATAACAAATTCAACAAAGAACAACAAAATGCTTTCTATGAAATCTTACATTTACCTAACTTAAACGAAGAACAACGCAATGGTTTCATCCAAAGCCTAAAAGATGACCCAAGCCAAAGCGCTAACCTTTTAGCAGAAGCTAAAAAGCTAAATGATGCTCAAGCACCAAAAGCTGACAACAAATTCAACAAAGAACAACAAAATGCTTTCTATGAAATTTTACATTTACCTAACTTAACTGAAGAACAACGTAACGGCTTCATCCAAAGCCTTAAAGACGATCCTTCAGTGAGCAAAGAAATTTTAGCAGAAGCTAAAAAGCTAAACGATGCTCAAGCACCAAAA[[[]]]AAAGAAGATGGTAACGGAGTACATGTCGTTAAACCTGGTGATACAGTAAATGACATTGCAAAAGCAAACGGCACTACTGCTGACAAAATTGCTGCAGATAACAAATTAGCTGATAAAAACATGATCAAACCTGGTCAAGAACTTGTTGTTGATAAGAAGCAACCAGCAAACCATGCAGATGCTAACAAAGCTCAAGCATTACCAGAAACTGGTGAAGAAAATCCATTCATCGGTACAACTGTATTTGGTGGATTATCATTAGCCTTAGGTGCAGCGTTATTAGCTGGACGTCGTCGCGAACTATA
t032,t003,t002,t008,t011,t127,t034,t084,t012,t037,t021,t015,t019,t091,t044,t022,t437,t045,t005,t067,t024,t223,t189,t018,t304,t004,t026,t001,t065,t020,t014,t216,t571,t108,t230,t355,t267,t899,t148,t064,t346,t608,t078,t159,t160,t038,t701,t688,t190,t311,t242,t041,t379,t056,t030,t105,t3949,t050,t548,t740,t186,t1451,t359,t645,t264,t085,t209,t068,t166,t318,t657,t136,t172,t164,t122,t121,t010,t337,t515,t025,t330,t1476,t338,t211,t040,t1081,t843,t089,t852,t073,t051,t062,t149,t171,t790,t294,t314,t177,t334,t6057
\ No newline at end of file
name: BioPythonWorkbench
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- biopython=1.71
- python=3.7
- regex
- openssl = 1.0
- samtools = 1.9
- seaborn = 0.9.0
- scipy = 1.2.1
- pysam = 0.15.2
- mpmath = 1.1.0
name: bwa
channels:
- defaults
- bioconda
dependencies:
- bwa=0.7.17
name: FastP
channels:
- bioconda
- defaults
dependencies:
- fastp=0.20.0
Fitness plot for the top 3 predictions on sample {{ snakemake.wildcards.id }} and a k of {{snakemake.params.k}} (dataset: {{snakemake.config.input_folder}})
K-mer coverage estimation histogram for {{ snakemake.wildcards.id }} and a k of {{snakemake.params.k}} (dataset: {{snakemake.config.input_folder}})
spa-type prediction overview for k-mer length: {{ snakemake.wildcards.kmer }}
Quality control report (fastp) for read with id: {{ snakemake.wildcards.id }}
#Assemble the QC'ed reads de novo with SPAdes; optionally the reference
#genome is passed as trusted contigs to guide the assembly.
rule spades:
    input:
        # quality-controlled paired-end reads
        read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
        read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
    output:
        # SPAdes writes a whole directory of assembly artifacts
        directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
    singularity:
        'docker://pegi3s/spades:latest'
    shell:
        # --trusted-contigs is appended only when reference_guided_assembly
        # is enabled in the config
        'spades.py -1 {input.read1} -2 {input.read2} -o {output} --phred-offset 33' + (
            (' --trusted-contigs ' + 'data/input/' + config['reference_genome'] + '/' + config['genome_file_identifier']) if
            config['reference_guided_assembly'] else '')
#Search the assembled scaffolds for exact matches against the known spa
#sequences (assembly-based prediction mode).
rule exactMatch:
    input:
        infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
    output:
        'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
    params:
        # database of spa sequences to match against
        spaSeqs = 'data/auxiliary/spaSequences.fa',
        # scaffolds produced by the spades rule above
        scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/spaSequenceSearch.py'
#Estimate the single-k-mer coverage from the QC'ed reads via a k-mer
#frequency histogram (count-based estimation method).
rule estimateKmerCoverage:
    input:
        read1 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_1_ending'],
        read2 = 'data/auxiliary/'+config['input_folder']+'/{id}'+'.qc'+config['input_read_2_ending']
    output:
        # annotated histogram included in the snakemake report
        histogram = report('data/output/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.png',category='Coverage-Based Method',caption='../report/kmerCoverageHistogram.rst'),
        histogramRaw = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmers.histo.raw.png',
        # single-number coverage estimate consumed by downstream rules
        mean = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/coverage_estimate_kmercountbased.txt',
        kmererror = 'data/auxiliary/' + config['input_folder'] + '/kmers/{kmer}/{id}/kmer_error.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        #cluster execution
        cpus = '1', #TODO: This could in theory be sped up significantly using a shared cache and multithreading
        gpus = '0',
        mem = '64G',
        walltime = '00:45:00'
    log:
        'logs/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
    benchmark:
        'benchmarks/' + config['input_folder'] + '/kmers/{kmer}/{id}/estimateKmerCoverage.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateKmerCoverage.py'
#NOTE: disabled rule kept for reference — the surrounding triple-quoted
#string turns it into an inert module-level string literal.
'''
rule estimateKmerCoverageFiltered:
    input:
        reads = 'data/auxiliary/'+config['input_folder']+'/{id}/filteredReads.fastq'
    output:
        histogram = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/kmers.histo.regionXOnly.png'
    params:
        k = lambda wildcards: wildcards.kmer
        #TODO: Threads = 2 ?
    log:
        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverageFiltered.log'
    conda:
        '../envs/main.yaml'
    script:
        '../scripts/estimateKmerCoverageFiltered.py'
'''
#Derive the k-mer coverage from an alignment-based read coverage estimate
#(alignment-based estimation method, alternative to the count-based rule).
rule estimateKmerCoverageAlignment:
    input:
        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt',
        readLengthEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/readLengthEstimate.txt',
        baseErrorEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/base_error_estimate.txt'
    output:
        kmerCoverage = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/coverage_estimate_alignmentbased.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '16G',
        walltime = '00:30:30'
    log:
        'logs/'+config['input_folder']+'/kmers/{kmer}/{id}/estimateKmerCoverage.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/estimateKmerCoverage_alignment.py'
#Compute the average read coverage from the sorted alignment with
#samtools depth + awk.
rule estimateCoverageAlignment:
    input:
        filteredAlignment = 'data/auxiliary/'+config['input_folder']+'/{id}/alignment.sorted.bam'
    output:
        coverageEstimate = 'data/auxiliary/'+config['input_folder']+'/{id}/coverageEstimate.txt'
    params:
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '16G',
        walltime = '00:30:30'
    log:
        'logs/'+config['input_folder']+'/{id}/estimateKmerCoverage_alignment.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    shell:
        #NOTE(review): sum only accumulates rows of reference "maskref" while
        #NR counts every depth row, so the average is only exact when the BAM
        #contains maskref positions exclusively — confirm against the mapping
        #rule that produces alignment.sorted.bam. The grep extracts the bare
        #number from awk's "Average = x" output.
        'samtools depth {input.filteredAlignment} | awk \' $1 == "maskref" {{sum+=$3}} END {{ print "Average = ",sum/NR}}\' | grep -Eo \'[+-]?[0-9]+([.][0-9]+)?\' > {output.coverageEstimate}'
#Compute the prior from the per-type likelihoods of the coverage-based (COV)
#model.
rule calcPriorProbabilitiesCoverage:
    input:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json'
    output:
        priorFilePath = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
    params:
        k = lambda wildcards: wildcards.kmer,
        # decimal precision for log-space math (see 'dps' in config.yaml)
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '2G',
        walltime = '00:05:30'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/calcPrior_cov.log'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calcPriorProbabilities.py'
#Combine likelihoods and prior into per-type probability scores for the
#coverage-based (COV) model.
rule calcProbabilitiesCoverage:
    input:
        likelihoods = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/likelihoods_cov.json',
        prior = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/prior_cov.txt'
    output:
        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv'
    params:
        # decimal precision for log-space math (see 'dps' in config.yaml)
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '8G',
        walltime = '00:10:30'
    conda:
        '../envs/biopythonworkbench.yaml'
    log:
        'logs/'+config['input_folder']+'/probabilistic/kmers/{kmer}/{id}/probabilities_cov.log'
    script:
        '../scripts/calcSpaTypeProbabilities.py'
#Plot how well the observed k-mer counts fit the expectation for the top 3
#predicted spa-types (report output, enabled via 'plot_top3_fit').
rule createFitnessPlots:
    input:
        counts = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/alignment.counts.json',
        probabilities = 'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/scores.probabilistic_cov.tsv',
        ratios = 'data/auxiliary/kmers/{kmer}/spaSequencesRatios.json'
    output:
        report('data/output/'+config['input_folder']+'/kmers/{kmer}/{id}_top3fit.svg',category='Coverage-Based Method',caption='../report/fitnessPlot.rst')
    params:
        # decimal precision for log-space math (see 'dps' in config.yaml)
        dps = config['dps'],
        # cluster execution
        cpus = '1',
        gpus = '0',
        mem = '1G',
        walltime = '00:20:30'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/createFitnessPlots.py'
#Compute the expected k-mer counts per spa-type from the coverage estimate.
#determineKmerCoverageEstimateFile() (defined in scripts/shared.py) picks the
#estimate file matching 'kmerCoverageEstimationMethod' in the config.
rule calcExpectedCounts:
    input:
        kmerCoverageEstimate = determineKmerCoverageEstimateFile(),
        counts = 'data/auxiliary/kmers/{kmer}/spaSequences.counts.json'
    output:
        'data/auxiliary/'+config['input_folder']+'/kmers/{kmer}/{id}/expected_counts.json'
    params:
        k = lambda wildcards: wildcards.kmer,
        # cluster execution
        cpus = '1',
        mem = '8G',
        gpus = '0',
        walltime = '00:30:00'
    conda:
        '../envs/biopythonworkbench.yaml'
    script:
        '../scripts/calcExpectedCounts.py'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment