Commit 49b482e9 authored by Jan Hoeckesfeld
Browse files

Merge branch 'master' into iterationset-tests

parents d718b2dd 8d50dc49
......@@ -28,16 +28,16 @@ assembly_model : False
###Reference Genome
reference_genome : NCTC8325
genome_file_identifier : genome.fa #NCTC8325
#genome_file_identifier : genome.fna
protein_table_identifier : protein.table
# genome_file_identifier : genome.fna # Found here: https://www.ncbi.nlm.nih.gov/assembly/GCF_900475245.1
protein_table_identifier : protein.table # "feature table"
reference_guided_assembly : False
# reference_genome_table refers to protein_table!
reference_genome_table_index_organism_id : 6 #NCTC8325
reference_genome_table_index_start_pos : 7 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_end_pos : 8 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_strand : 9 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_protein_id : 13 #H-EMRSA-15 #NCTC8325
reference_genome_table_index_strand : 9 #H-EMRSA-15 #NCTC8325 # + or -
reference_genome_table_index_protein_id : 13 #H-EMRSA-15 #NCTC8325 # where the protein_a_identifier is located!
protein_a_identifier : protein A #NCTC8325
l_lactate_permease_identifier : L-lactate permease #NCTC8325
......@@ -81,7 +81,7 @@ dps : 100
#countMean -> k-mer coverage is based on a k-mer frequency histogram where the average k-mer coverage is estimated by the mean of k-mer frequencies (again, cutting off error k-mers)
kmerCoverageEstimationMethod : countPoisson
#when calclating probabilities based on COV method, skip a tpye as soon as the deviation between expected and observed exceeds (deviationCutoff*kmerCoverageEstimate) for any k-mer
#when calculating probabilities based on COV method, skip a type as soon as the deviation between expected and observed exceeds (deviationCutoff*kmerCoverageEstimate) for any k-mer
deviationCutoff : 2.5
#likelihoods exceeding this value are not taken into account when calculating a prior
#likelihoodCutoff : -50000
......
# Run SPAdes on the paired '.qc' read files of one sample ({id}) and declare the
# resulting 'spades' directory as output. The command runs inside the
# pegi3s/spades container with a fixed phred offset of 33.
# NOTE(review): the rule header is split over two lines here ('rule' / 'spades:')
# and the same rule is defined again directly below. Snakemake rejects duplicate
# rule names; this looks like both sides of a diff - confirm which copy to keep.
rule
spades:
input:
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
output:
directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
singularity:
'docker://pegi3s/spades:latest'
shell:
# '--trusted-contigs <reference genome file>' is appended only when
# config['reference_guided_assembly'] is truthy; otherwise nothing is added.
'spades.py -1 {input.read1} -2 {input.read2} -o {output} --phred-offset 33' + (
(' --trusted-contigs ' + 'data/input/' + config['reference_genome'] + '/' + config['genome_file_identifier']) if
config['reference_guided_assembly'] else '')
# Run SPAdes on the paired '.qc' read files of one sample ({id}) and declare the
# resulting 'spades' directory as output. The command runs inside the
# pegi3s/spades container with a fixed phred offset of 33.
rule spades:
input:
read1 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_1_ending'],
read2 = 'data/auxiliary/' + config['input_folder'] + '/{id}' + '.qc' + config['input_read_2_ending']
output:
directory('data/auxiliary/' + config['input_folder'] + '/{id}/spades')
singularity:
'docker://pegi3s/spades:latest'
shell:
# '--trusted-contigs <reference genome file>' is appended only when
# config['reference_guided_assembly'] is truthy; otherwise nothing is added.
'spades.py -1 {input.read1} -2 {input.read2} -o {output} --phred-offset 33' + (
(' --trusted-contigs ' + 'data/input/' + config['reference_genome'] + '/' + config['genome_file_identifier']) if
config['reference_guided_assembly'] else '')
# Run scripts/spaSequenceSearch.py for one sample ({id}) in the
# biopythonworkbench conda environment; the declared output is exactMatches.tsv.
# The sample's 'spades' directory is the only tracked input. The scaffolds file
# inside it and the spa sequence FASTA are handed to the script as params.
# Presumably the script looks for exact spa-sequence matches in the scaffolds,
# judging by the names - confirm in the script itself.
# NOTE(review): the rule header is split over two lines here ('rule' /
# 'exactMatch:') and the same rule is defined again directly below. Snakemake
# rejects duplicate rule names; this looks like both sides of a diff - confirm
# which copy to keep.
rule
exactMatch:
input:
infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
output:
'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
params:
spaSeqs = 'data/auxiliary/spaSequences.fa',
scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
conda:
'../envs/biopythonworkbench.yaml'
script:
'../scripts/spaSequenceSearch.py'
# Run scripts/spaSequenceSearch.py for one sample ({id}) in the
# biopythonworkbench conda environment; the declared output is exactMatches.tsv.
# The sample's 'spades' directory is the only tracked input. The scaffolds file
# inside it and the spa sequence FASTA are handed to the script as params.
# Presumably the script looks for exact spa-sequence matches in the scaffolds,
# judging by the names - confirm in the script itself.
rule exactMatch:
input:
infolder = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades'
output:
'data/output/' + config['input_folder'] + '/{id}/exactMatches.tsv'
params:
spaSeqs = 'data/auxiliary/spaSequences.fa',
scaffolds = 'data/auxiliary/' + config['input_folder'] + '/{id}/spades/scaffolds.fasta'
conda:
'../envs/biopythonworkbench.yaml'
script:
'../scripts/spaSequenceSearch.py'
from Bio import SeqIO
import re
# Expected shape of one line of the spa type list: 't<digits>,' followed by at
# least one two-digit group and optional '-<digit>' groups. The pattern is applied
# with match(), which anchors only at the start of the line, so anything after the
# matched prefix is not validated.
regSpaTypes = re.compile(r'((t[0-9]+,)([0-9][0-9])+(-[0-9])*)')
#Load repeat sequences in fasta format
# 'snakemake' is not imported here; Snakemake injects it when the file is run
# through a rule's 'script:' directive.
repeats = SeqIO.parse(snakemake.input['repeats'],'fasta')
......@@ -20,18 +22,21 @@ with open(snakemake.input['types'],'r') as infile, open(snakemake.output['out'],
spaTypes = infile.read().splitlines()
spaTypesExtended = []
for spaType in spaTypes:
# Expand each spa type line ("<name>,<repeat>-<repeat>-...") into a FASTA record
# whose sequence is the concatenation of the sequences of its repeats.
# Malformed lines and lines that reference unknown repeats are reported and
# skipped, so one bad entry no longer aborts the whole run with an
# IndexError/KeyError after the warning has been printed.
for line, spaType in enumerate(spaTypes):
    if not regSpaTypes.match(spaType):
        print("spaType in spatypes" + " does not fit into the pattern: " + spaType + " in line " + str(line+1))
    fields = spaType.split(',')
    name = fields[0]
    # An empty filter list means "keep every type".
    if filterList and name not in filterList:
        continue
    # A line without ',' or with nothing after it has no repeats to expand.
    # Such a line cannot match the pattern, so it was already reported above.
    if len(fields) < 2 or not fields[1]:
        continue
    sptRepeats = fields[1].split('-')
    unknownRepeats = [repeat for repeat in sptRepeats if repeat not in repeatsDict]
    if unknownRepeats:
        for repeat in unknownRepeats:
            print("Unrecognized repeat in spatype " + repeat + ", found in line " + str(line+1))
        # The sequence cannot be assembled without every repeat; skip this type.
        continue
    sequence = ''.join(repeatsDict[repeat] for repeat in sptRepeats)
    outfile.write('>'+name+'\n'+sequence+'\n')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment