Commit c7246bda authored by Philipp Spohr

Added software

parent 7ffe33b6
import json
import re
import math
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
import sys
from shared import *
import scipy
import scipy.signal
import pysam
import logging
logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")
from scipy.stats import poisson
from collections import Counter
#########################################################################################
k = int(snakemake.params['k'])
kmers = {}
f = open(snakemake.output['debug'], 'w+')
f.write('RUN\n')
#Parse meta data
meta = {}
with open(snakemake.input['regions'], 'r') as infile:
    for l in infile.read().splitlines():
        data = l.split('\t')
        meta[data[0]] = (int(data[1]), int(data[2]))
samfile = pysam.AlignmentFile(snakemake.input['alignment'])
readIDs = {}
for read in samfile.fetch():
    readID = read.query_name
    if readID not in readIDs:
        readIDs[readID] = 1
    else:
        logging.critical('{}: Read got processed twice!'.format(readID))
    if read.is_secondary or read.is_unmapped:
        logging.info('{}: Secondary (or unmapped) alignment ... skipping'.format(readID))
        continue
    reference_start = read.reference_start
    reference_end = read.reference_end
    region_start, region_end = meta[read.reference_name]
    isReversed = containsFlag(read.flag, 4) #presumably bit 4 (0x10), the reverse-strand flag; see shared.py
    #Clip the aligned sequence down to the region of interest
    startCutoff = (region_start - reference_start) if region_start > reference_start else 0
    endCutoff = (reference_end - region_end) if reference_end > region_end else 0
    f.write('Reference: {} (RegionBorders: {}/{})\n'.format(read.reference_name, region_start, region_end))
    f.write('Cutaways [{}:{}]: (Alignment Reference Start/End {}/{})\n'.format(startCutoff, endCutoff, reference_start, reference_end))
    f.write('RawSequence: {}\n'.format(read.query_alignment_sequence))
    sequence = read.query_alignment_sequence
    if endCutoff > 0:
        sequence = sequence[:-endCutoff]
    if startCutoff > 0:
        sequence = sequence[startCutoff:]
    #sequence is already clipped at this point; re-slicing it with [startCutoff:-endCutoff]
    #would cut twice, and an endCutoff of 0 would even empty the string
    f.write('CutSequence: {}\n'.format(sequence))
    if isReversed:
        sequence = Seq(sequence).reverse_complement()
    parseKmers(kmers, sequence, k) #see shared.py
f.close()
#Normalize vector
normalized_kmers = normalizeKmers(kmers)
with open(snakemake.output['profile'], 'w') as outfile:
    json.dump(normalized_kmers, outfile)
with open(snakemake.output['counts'], 'w') as outfile:
    json.dump(kmers, outfile)
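#All of these scripts pull helpers from shared.py via `from shared import *` without
#showing that module. A minimal, hypothetical sketch of what the helpers used above
#presumably look like, inferred purely from the call sites (the real parseKmers also
#accepts optional origin-tracking arguments, as used further below):
def containsFlag(flag, bit):
    #Check whether bit number `bit` is set in a SAM flag (e.g. bit 4 == 0x10, reverse strand)
    return (flag >> bit) & 1 == 1

def parseKmers(kmers, sequence, k):
    #Slide a window of length k over the sequence and count every k-mer
    sequence = str(sequence)
    for i in range(len(sequence) + 1 - k):
        kmer = sequence[i:i + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1

def normalizeKmers(kmers):
    #Turn absolute counts into relative frequencies that sum to 1
    total = sum(kmers.values())
    return {kmer: count / total for kmer, count in kmers.items()}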
import json
import re
import math
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from shared import *
#Creates a k-mer profile for each sequence in a set of spa-type sequences
spaSequences = SeqIO.parse(snakemake.input['sequences'], 'fasta')
k = int(snakemake.params['k'])
profiles = {}
counts = {}
for sequence in spaSequences:
    kmers = {}
    parseKmers(kmers, sequence.seq, k)
    parseKmers(kmers, sequence.seq.reverse_complement(), k) #also count reverse-complement k-mers
    counts[sequence.id] = kmers
    profiles[sequence.id] = normalizeKmers(kmers)
with open(snakemake.output['profiles'], 'w') as outfile:
    json.dump(profiles, outfile)
with open(snakemake.output['counts'], 'w') as outfile:
    json.dump(counts, outfile)
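#Why both strands are counted: sequencing reads can originate from either strand, so a
#read from the reverse strand contributes the reverse-complement k-mers of the spa-type
#sequence. A tiny, hypothetical illustration:
from Bio.Seq import Seq
s = Seq('ACGGT')
print(s.reverse_complement()) #ACCGT: its k-mers differ from those of ACGGT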
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import SeqIO
from itertools import chain
import json
from shared import *
spaSeqs = json.load(open(snakemake.input['expectedCounts'], 'r'))
reads1 = SeqIO.parse(snakemake.input['read1'], 'fastq')
reads2 = SeqIO.parse(snakemake.input['read2'], 'fastq')
k = int(snakemake.params['k'])
### Step 1: Identify the set of valid k-mers (k-mers that occur in any spa type)
validKmers = {}
for x in spaSeqs:
    for kmer in spaSeqs[x]:
        validKmers[kmer] = 0
### Step 2: Count valid k-mers in the reads
stack = chain(reads1, reads2)
for read in stack:
    sequence = read.seq
    for i in range(len(sequence) + 1 - k):
        kmer = str(sequence[i:i + k])
        if kmer in validKmers:
            validKmers[kmer] += 1
### Step 3: Normalize
normalizedKmers = normalizeKmers(validKmers)
### Step 4: Generate output
with open(snakemake.output['counts'], 'w') as outfile:
    json.dump(validKmers, outfile)
with open(snakemake.output['profile'], 'w') as outfile:
    json.dump(normalizedKmers, outfile)
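#Minimal worked example of the Step 1/2 filter above (hypothetical data, k = 3):
exampleValid = {'ACG': 0, 'CGT': 0} #k-mers occurring in at least one spa type
exampleRead = 'ACGTT'
for i in range(len(exampleRead) + 1 - 3):
    kmer = exampleRead[i:i + 3]
    if kmer in exampleValid: #the window GTT occurs in no spa type and is skipped
        exampleValid[kmer] += 1
print(exampleValid) #{'ACG': 1, 'CGT': 1}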
from Bio import SeqIO
from itertools import chain
from shared import *
import json
k = int(snakemake.params['k'])
#Read the meta data (in order to cut reads appropriately)
regionXBorders = (-1, -1)
with open(snakemake.input['regionXMetaData'], 'r') as metaFile:
    data = metaFile.read().split('\t')
    regionXBorders = (int(data[0]), int(data[1]))
regionXStart = regionXBorders[0]
regionXEnd = regionXBorders[1]
#Read the filtered reads (those that really originate from region x)
filteredReads = SeqIO.parse(snakemake.input['filteredReads'], 'fastq')
kmers = {}
kmerOrigins = {}
for read in filteredReads:
    flags = {}
    startPosRead1, startPosRead2, r1strand, r2strand = parseDwgsimReadName(read.id)
    readLength = len(read)
    relevantSequence = read.seq
    readStart = -1
    readStrand = -1
    if read.name.endswith('/1'):
        readStart = startPosRead1
        readStrand = r1strand
    elif read.name.endswith('/2'):
        readStart = startPosRead2
        readStrand = r2strand
    #Detect read orientation
    readOrientation = -1
    if startPosRead2 < startPosRead1:
        readOrientation = 1 if read.name.endswith('/1') else 0 #reverse
    else:
        readOrientation = 0 if read.name.endswith('/1') else 1
    flags['readStart'] = readStart
    flags['readStrand'] = readStrand
    flags['readOrientation'] = readOrientation
    readStart -= 1 #presumably converting dwgsim's 1-based start to 0-based
    readEnd = readStart + readLength
    if regionXStart > readStart:
        clippingStart = regionXStart - readStart
        if readStrand == 0:
            relevantSequence = relevantSequence[clippingStart:]
        else: #reverse-strand read: the stored sequence is reverse-complemented, so clip from its end
            relevantSequence = relevantSequence[:-clippingStart]
        flags['trimStart'] = clippingStart
    if readEnd > regionXEnd:
        clippingEnd = readEnd - regionXEnd
        if readStrand == 0:
            relevantSequence = relevantSequence[:-clippingEnd]
        else:
            relevantSequence = relevantSequence[clippingEnd:]
        flags['trimEnd'] = clippingEnd
    parseKmers(kmers, relevantSequence, k, kmerOrigins, read.name, flags)
kmerOriginsAnalysis = {}
#Post-processing: analyze the total fraction of reversals and complements
for kmer in kmerOrigins:
    total = 0
    rv = 0
    cp = 0
    ts = 0
    te = 0
    rs1 = 0
    ro1 = 0
    for read in kmerOrigins[kmer]: #TODO: make more readable
        flags = read[2]
        total += 1
        if 'complement' in flags:
            cp += 1
        if 'reverse' in flags:
            rv += 1
        if 'trimStart' in flags:
            ts += 1
        if 'trimEnd' in flags:
            te += 1
        if flags['readStrand'] == 1:
            rs1 += 1
        if flags['readOrientation'] == 1:
            ro1 += 1
    kmerOriginsAnalysis[kmer] = {
        'total': total,
        'reversedKmers': rv,
        'complementedKmers': cp,
        'startTrimmed': ts,
        'endTrimmed': te,
        'readStrand1': rs1,
        'readOrientation1': ro1,
        'origins': kmerOrigins[kmer]
    }
with open(snakemake.output['counts'], 'w') as outfile:
    json.dump(kmers, outfile)
with open(snakemake.output['origins'], 'w') as outfile:
    json.dump(kmerOriginsAnalysis, outfile)
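#Hypothetical sketch of the parseDwgsimReadName helper from shared.py, assuming
#dwgsim's underscore-delimited read names, where the contig name is followed by the
#start positions and strand bits of both mates; the real helper may parse more fields.
def parseDwgsimReadName(readName):
    fields = readName.split('_')
    startPosRead1 = int(fields[1]) #1-based start of read 1 on the reference
    startPosRead2 = int(fields[2]) #1-based start of read 2 on the reference
    r1strand = int(fields[3])      #0 = forward, 1 = reverse
    r2strand = int(fields[4])
    return startPosRead1, startPosRead2, r1strand, r2strand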
import os.path
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
INDEX_ORGANISM_ID = snakemake.config['reference_genome_table_index_organism_id']
INDEX_START_POS = snakemake.config['reference_genome_table_index_start_pos']
INDEX_END_POS = snakemake.config['reference_genome_table_index_end_pos']
INDEX_PROTEIN_NAME = snakemake.config['reference_genome_table_index_protein_id']

def main():
    extractSeq(
        snakemake.input['pt'],
        snakemake.params['pid'],
        snakemake.output['main'],
        snakemake.input['refg']
    )

def extractSeq(pt, pid, out, fna):
    #Check if the protein id exists and retrieve its coordinates from the protein table
    with open(pt, 'r') as tablefile:
        lines = tablefile.read().splitlines()[1:] #Skip the first line -> contains only headers
        for line in lines:
            #Retrieve the relevant fields
            entries = line.split('\t')
            organism = entries[INDEX_ORGANISM_ID]
            startPos = int(entries[INDEX_START_POS])
            endPos = int(entries[INDEX_END_POS])
            name = entries[INDEX_PROTEIN_NAME]
            if name == pid:
                record = SeqIO.read(fna, 'fasta')
                fillerLength = endPos + 1 - startPos
                subsequencePrior = record.seq[0:startPos]
                subsequenceMasked = Seq('N' * fillerLength, generic_dna)
                subsequencePosterior = record.seq[endPos + 1:]
                masked = SeqRecord(subsequencePrior + subsequenceMasked + subsequencePosterior, name='Masked Reference Genome', id='maskref', description='Masked Reference Genome (Protein A replaced with N)')
                SeqIO.write(masked, out, 'fasta')
                return
    print("Did not find the entry: {} in the protein table ...".format(pid))

if __name__ == "__main__":
    main()
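#The slice arithmetic in extractSeq keeps positions 0..startPos-1, masks the closed
#interval [startPos, endPos] with Ns, and resumes at endPos+1, so the masked record
#keeps the original length. A tiny worked example, assuming 0-based inclusive
#coordinates (if the protein table is 1-based, an off-by-one shift would be needed):
from Bio.Seq import Seq
seq = Seq('ACGTACGT')
startPos, endPos = 2, 4
fillerLength = endPos + 1 - startPos #3 masked positions: indices 2, 3 and 4
masked = seq[0:startPos] + 'N' * fillerLength + seq[endPos + 1:]
print(masked) #ACNNNCGT, same length as the input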
import sys
import logging
logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")
groundTruth = {}
with open(snakemake.input['groundTruth'], 'r') as groundTruthFile:
    lines = groundTruthFile.read().splitlines()
    for l in lines:
        d = l.split('\t')
        groundTruth[d[0]] = d[1]

def metaSummarize(input, output):
    with open(output, 'w') as outFile:
        for f in input:
            hitCount = 0
            with open(f, 'r') as inFile:
                data = inFile.read().splitlines()
                for line in data:
                    readID = line.split('\t')[0]
                    predictedTypeID = line.split('\t')[1]
                    if readID not in groundTruth:
                        logging.critical('ReadID: {} is not part of the ground truth provided! Terminating ...'.format(readID))
                        print(groundTruth)
                        sys.exit(-1)
                    hitCount += 1 if predictedTypeID == groundTruth[readID] else 0
            outFile.write(f + '\t' + str(hitCount) + '\n')

metaSummarize(snakemake.input['summary'], snakemake.output['meta'])
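#Hypothetical layout of the tab-separated files this summary step assumes: the ground
#truth maps read IDs to true spa types, each summary file maps read IDs to predicted
#types, and the meta output gets one line per summary file with its hit count, e.g.:
#groundTruth            summary file              meta output
#read_001<TAB>t003      read_001<TAB>t003 (hit)   <summary-file><TAB>1
#read_002<TAB>t008      read_002<TAB>t011 (miss)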
import mpmath as mp
import logging
#Memoized versions of the mpmath functions to improve performance TODO: Evaluate if this is indeed a speedup
memopower = mp.memoize(mp.power)
memolog1p = mp.memoize(mp.log1p)
memoexp = mp.memoize(mp.exp)
memolog = mp.memoize(mp.log)
'''
def calculateKmerLikelihood(observedCounts, expectedCounts, kmerError, k, validKmers):
    #We use an mpmath float to have a sufficiently precise numerical representation
    probability = mp.mpf(0)
    #Split off a separate probability for analysis purposes
    unexpectedProbability = mp.mpf(0)
    #Calculate the default value for expected counts
    sharedKmers = set(observedCounts.keys()).difference(set(expectedCounts.keys()))
    expectedDefaultValue = kmerError * sum(observedCounts.values()) / len(sharedKmers)
    for kmerID in validKmers:
        #If we didn't observe a k-mer at all we assume we observed it 0 times
        observedCount = 0
        expectedCount = expectedDefaultValue
        #Otherwise we have counted before how many times we observed it
        if kmerID in observedCounts:
            observedCount = observedCounts[kmerID]
        if kmerID in expectedCounts:
            expectedCount = expectedCounts[kmerID]
        #Work in log space to avoid underflow
        local_p = poisson.logpmf(observedCount, expectedCount)
        if local_p == 0:
            logging.warning('Warning ...! local_p has reached {}'.format(local_p))
            logging.warning('kmerID:{} observed:{} expected:{}'.format(kmerID, observedCount, expectedCount))
            sys.exit(-1)
        else:
            try:
                probability += local_p
                if kmerID not in expectedCounts:
                    unexpectedProbability += local_p
            except ValueError as e:
                logging.warning('kmerID:{} observed:{} expected:{}'.format(kmerID, observedCount, expectedCount))
                logging.warning('calculated (raw) probability of {}'.format(local_p))
                logging.critical(str(e))
                sys.exit(-1)
    if probability == 0:
        logging.critical('Warning: Total log probability has reached 0! float insufficient?')
        sys.exit(-1)
    return probability, unexpectedProbability

#TODO: Move to shared as this is probably not a very probabilistic thing
def hamming_distance(seq1, seq2):
    #Not called very often thanks to memoization, but there may be faster ways (bit operations)
    return sum(1 if x != y else 0 for x, y in zip(seq1, seq2))

USE_OWN_MEMOIZATION = True #Use additional memoization besides the mpmath functionality
MEMOIZE_ON_PROCESS_LEVEL = True #Only use a per-process cache, not a cache shared between processes (avoids locks)

#Alternative model Alex proposed ... let's see how well it does
def calculateKmerLikelihood_Generative(observedCounts, sequenceKmerProfiles, baseErrorRate, k, hdLikelihoods, likelihoods):
    #Don't use global caches (avoids locks), just cache locally
    if MEMOIZE_ON_PROCESS_LEVEL:
        hdLikelihoods = {}
        likelihoods = {}
    #Count how often k-mers occur
    nrOfSequenceKmers = sum(sequenceKmerProfiles[kmer] for kmer in sequenceKmerProfiles)
    #Assume a uniform distribution for drawing generator k-mers
    chanceOfDrawingKmer = 1 / nrOfSequenceKmers
    probability = mp.mpf(1)
    for kmerIDObs in observedCounts:
        obsProbability = 0
        factorObs = observedCounts[kmerIDObs]
        for kmerIDGen in sequenceKmerProfiles:
            #How many times is this k-mer contained in the sequence?
            factor = sequenceKmerProfiles[kmerIDGen]
            hd = hamming_distance(kmerIDObs, kmerIDGen)
            calculateHdLikelihood = True
            hdLikelihood = None
            if USE_OWN_MEMOIZATION:
                if hd in hdLikelihoods:
                    #Maybe we already calculated this?
                    calculateHdLikelihood = False
                    hdLikelihood = hdLikelihoods[hd]
            if calculateHdLikelihood:
                intactBasesProbability = memopower((1 - baseErrorRate), (k - hd))
                alteredBasesProbability = memopower(baseErrorRate, hd)
                hdLikelihood = intactBasesProbability * alteredBasesProbability
                if USE_OWN_MEMOIZATION:
                    hdLikelihoods[hd] = hdLikelihood
            obsProbability += factor * hdLikelihood
        obsProbability *= chanceOfDrawingKmer
        obsProbability = memopower(obsProbability, factorObs)
        if mp.isnan(obsProbability):
            raise ValueError("Probability is not a number: {}!".format(obsProbability))
        probability *= obsProbability
        if mp.isnan(probability):
            raise ValueError("LogProbability is not a number: {}!".format(probability))
    return probability
'''
def logSpaceAdd(update, before):
    '''
    Add two probabilities that are given as log values.
    Assume b >= a (the code below swaps the arguments to ensure this).
    We want to calculate ln(exp(ln(a))+exp(ln(b))), thus
    ln(
        exp(ln(b))*(1+exp(ln(a)-ln(b)))
    )
    ->
    ln(b) + ln(1+exp(ln(a)-ln(b)))
    ->
    ln(b) + ln1p(exp(ln(a)-ln(b)))
    '''
    #As there is no neutral element of addition in log space we only start adding once two values are given
    if before is None:
        return update
    else:
        a = None
        b = None
        #b takes the larger log value so that the exponent below stays <= 0
        if update > before:
            b = update
            a = before
        else:
            b = before
            a = update
        x = mp.mpf(a) - mp.mpf(b)
        xexp = memoexp(x)
        val = memolog1p(xexp)
        val = val + b
        if val == 0:
            print('a:{} b:{} x:{} xexp:{} log1p:{}'.format(a, b, x, xexp, val))
            raise ValueError('LogSpace Addition has resulted in a value of 0: a = {} b = {} (possible underflow)'.format(a, b))
        #elif val > 0:
        #    print('a:{} b:{} x:{} xexp:{} log1p:{}'.format(a, b, x, xexp, val))
        #    raise ValueError('Prior Update has resulted in a value > 0: a = {} b = {} val = {}'.format(a, b, val))
        return val #the function must hand the accumulated log value back to the caller
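#Quick sanity check of logSpaceAdd as a usage sketch (assuming the reconstruction
#above, including the final return): adding ln(0.5) and ln(0.25) should give ln(0.75).
total = logSpaceAdd(mp.log(mp.mpf('0.5')), None) #first summand: nothing to add to yet
total = logSpaceAdd(mp.log(mp.mpf('0.25')), total)
print(total, mp.log(mp.mpf('0.75'))) #both printed values should agree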