Commit 49b482e9 authored by Jan Hoeckesfeld
Browse files

Merge branch 'master' into iterationset-tests

parents d718b2dd 8d50dc49
...@@ -28,16 +28,16 @@ assembly_model : False ...@@ -28,16 +28,16 @@ assembly_model : False
###Reference Genome ###Reference Genome
reference_genome : NCTC8325 reference_genome : NCTC8325
genome_file_identifier : genome.fa #NCTC8325 genome_file_identifier : genome.fa #NCTC8325
#genome_file_identifier : genome.fna # genome_file_identifier : genome.fna # Found here: https://www.ncbi.nlm.nih.gov/assembly/GCF_900475245.1
protein_table_identifier : protein.table protein_table_identifier : protein.table # "feature table"
reference_guided_assembly : False reference_guided_assembly : False
# reference_genome_table refers to protein_table!
reference_genome_table_index_organism_id : 6 #NCTC8325 reference_genome_table_index_organism_id : 6 #NCTC8325
reference_genome_table_index_start_pos : 7 #H-EMRSA-15 #NCTC8325 reference_genome_table_index_start_pos : 7 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_end_pos : 8 #H-EMRSA-15 #NCTC8325 reference_genome_table_index_end_pos : 8 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_strand : 9 #H-EMRSA-15 #NCTC8325 reference_genome_table_index_strand : 9 #H-EMRSA-15 #NCTC8325 # + or -
reference_genome_table_index_protein_id : 13 #H-EMRSA-15 #NCTC8325 reference_genome_table_index_protein_id : 13 #H-EMRSA-15 #NCTC8325 # where the protein_a_identifier is located!
protein_a_identifier : protein A #NCTC8325 protein_a_identifier : protein A #NCTC8325
l_lactate_permease_identifier : L-lactate permease #NCTC8325 l_lactate_permease_identifier : L-lactate permease #NCTC8325
...@@ -81,7 +81,7 @@ dps : 100 ...@@ -81,7 +81,7 @@ dps : 100
#countMean -> k-mer coverage is based on a k-mer frequency histogram where the average k-mer coverage is estimated by the mean of k-mer frequencies (again, cutting off error k-mers) #countMean -> k-mer coverage is based on a k-mer frequency histogram where the average k-mer coverage is estimated by the mean of k-mer frequencies (again, cutting off error k-mers)
kmerCoverageEstimationMethod : countPoisson kmerCoverageEstimationMethod : countPoisson
#when calclating probabilities based on COV method, skip a tpye as soon as the deviation between expected and observed exceeds (deviationCutoff*kmerCoverageEstimate) for any k-mer #when calculating probabilities based on COV method, skip a type as soon as the deviation between expected and observed exceeds (deviationCutoff*kmerCoverageEstimate) for any k-mer
deviationCutoff : 2.5 deviationCutoff : 2.5
#likelihoods exceeding this value are not taken into account when calculating a prior #likelihoods exceeding this value are not taken into account when calculating a prior
#likelihoodCutoff : -50000 #likelihoodCutoff : -50000
......
rule rule spades:
spades: input:
input: read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'], read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending'] output:
output: directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades') singularity:
singularity: 'docker://pegi3s/spades:latest'
'docker://pegi3s/spades:latest' shell:
shell: 'spades.py -1 {input.read1} -2 {input.read2} -o {output} --phred-offset 33' + (
'spades.py -1 {input.read1} -2 {input.read2} -o {output} --phred-offset 33' + ( (' --trusted-contigs ' + 'data/input/' + config['reference_genome'] + '/' + config['genome_file_identifier']) if
(' --trusted-contigs ' + 'data/input/' + config['reference_genome'] + '/' + config['genome_file_identifier']) if config['reference_guided_assembly'] else '')
config['reference_guided_assembly'] else '')
rule rule exactMatch:
exactMatch: input:
input: infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades' output:
output: 'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv' params:
params: spaSeqs = 'data/auxiliary/spaSequences.fa',
spaSeqs = 'data/auxiliary/spaSequences.fa', scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta' conda:
conda: '../envs/biopythonworkbench.yaml'
'../envs/biopythonworkbench.yaml' script:
script: '../scripts/spaSequenceSearch.py'
'../scripts/spaSequenceSearch.py'
from Bio import SeqIO from Bio import SeqIO
import re
regSpaTypes = re.compile(r'((t[0-9]+,)([0-9][0-9])+(-[0-9])*)')
#Load repeat sequences in fasta format #Load repeat sequences in fasta format
repeats = SeqIO.parse(snakemake.input['repeats'],'fasta') repeats = SeqIO.parse(snakemake.input['repeats'],'fasta')
...@@ -20,18 +22,21 @@ with open(snakemake.input['types'],'r') as infile, open(snakemake.output['out'], ...@@ -20,18 +22,21 @@ with open(snakemake.input['types'],'r') as infile, open(snakemake.output['out'],
spaTypes = infile.read().splitlines() spaTypes = infile.read().splitlines()
spaTypesExtended = [] spaTypesExtended = []
for spaType in spaTypes: for line, spaType in enumerate(spaTypes):
if not regSpaTypes.match(spaType):
print("spaType in spatypes" + " does not fit into the pattern: " + spaType + "in line " + str(line+1))
split = spaType.split(',') split = spaType.split(',')
name = split[0] name = split[0]
if len(filterList) > 0: if len(filterList) > 0:
if not name in filterList: if not name in filterList:
continue continue
value = split[1] value = split[1]
#print(name,value)
sptRepeats = value.split('-') sptRepeats = value.split('-')
if not sptRepeats: if not sptRepeats:
continue continue
sequence = '' sequence = ''
for repeat in sptRepeats: for repeat in sptRepeats:
if repeat not in repeatsDict:
print("Unrecoginized repeat in spatype " + repeat + ", found in line " + str(line+1))
sequence += repeatsDict[repeat] sequence += repeatsDict[repeat]
outfile.write('>'+name+'\n'+sequence+'\n') outfile.write('>'+name+'\n'+sequence+'\n')
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment