Commit c7246bda authored by Philipp Spohr's avatar Philipp Spohr
Browse files

Added software

parent 7ffe33b6
import json

# Compute per-kmer differences between observed and true kmer counts.
# Fixed: the original leaked the two input file handles via
# json.load(open(...)); use context managers instead.
with open(snakemake.input['trueCounts'], 'r') as infile:
    trueCounts = json.load(infile)
with open(snakemake.input['observedCounts'], 'r') as infile:
    observedCounts = json.load(infile)

# Every kmer appearing in either count set.
keys = set().union(trueCounts.keys(), observedCounts.keys())

# One tab-separated line per kmer: "<kmer>\t<observed - true>"; a kmer
# missing on one side is treated as count 0 there.
with open(snakemake.output['differences'], 'w') as outfile:
    for x in keys:
        outfile.write(x + '\t')
        if x in observedCounts and x in trueCounts:
            outfile.write(str(observedCounts[x] - trueCounts[x]))
        elif x in observedCounts:
            outfile.write(str(observedCounts[x]))
        elif x in trueCounts:
            outfile.write(str(0 - trueCounts[x]))
        outfile.write('\n')
import json

# Determine the spa type of this sample from the ground-truth table
# (tab-separated: sample id, spa type).
spaType = 'unknown'
with open(snakemake.input['groundTruthFile'], 'r') as infile:
    data = infile.read().splitlines()
    for line in data:
        lineData = line.split('\t')
        if lineData[0] == snakemake.params['inputFileID']:
            spaType = lineData[1]
            break

# Fixed: the original leaked both json.load(open(...)) handles.
with open(snakemake.input['trueCounts'], 'r') as infile:
    trueCounts = json.load(infile)
# NOTE(review): if the sample id is not found above, spaType stays 'unknown'
# and this lookup raises KeyError -- confirm that is the intended failure mode.
with open(snakemake.input['expectedCounts'], 'r') as infile:
    expCounts = json.load(infile)[spaType]

# Fixed: removed a dead `diff` dict that was built from expCounts/trueCounts
# but never read afterwards (the report below recomputes everything it needs).
with open(snakemake.output['differences'], 'w') as outfile:
    totalDiffShared = 0
    totalKmerCount = 0
    # Kmers present in both the expected and the observed counts.
    outfile.write('[ Shared Kmers ] \n')
    keys = set.intersection(set(expCounts.keys()), set(trueCounts.keys()))
    for x in sorted(keys):
        outfile.write('{} -> Expected: {} TrueReads: {} Ratio : {}\n'.format(x, expCounts[x], trueCounts[x], round(expCounts[x]/trueCounts[x], 2)))
        totalDiffShared += (trueCounts[x] - expCounts[x])
        totalKmerCount += trueCounts[x]
    # Kmers expected for this spa type but never observed in the reads.
    totalDiffUnobserved = 0
    outfile.write('[ Unobserved Kmers ] \n')
    keys = set(expCounts.keys()).difference(set(trueCounts.keys()))
    for x in sorted(keys):
        outfile.write('{} -> Expected: {} \n'.format(x, expCounts[x]))
        totalDiffUnobserved -= expCounts[x]
    # Kmers observed in the reads but not expected for this spa type.
    totalDiffUnexpected = 0
    outfile.write('[ Unexpected Kmers ] \n')
    keys = set(trueCounts.keys()).difference(set(expCounts.keys()))
    for x in sorted(keys):
        outfile.write('{} -> TrueReads: {} \n'.format(x, trueCounts[x]))
        totalDiffUnexpected += trueCounts[x]
        totalKmerCount += trueCounts[x]
    # Summary statistics over all three categories.
    outfile.write('[ Stats ] \n')
    outfile.write('Total Amount of Observed Kmers: {}\n'.format(totalKmerCount))
    outfile.write('Total Diff Shared Kmers: {}\n'.format(totalDiffShared))
    outfile.write('Total Diff Unobserved Kmers: {}\n'.format(totalDiffUnobserved))
    outfile.write('Total Diff Unexpected Kmers: {}\n'.format(totalDiffUnexpected))
import seaborn as sns
import matplotlib.pyplot as plt

# Coverage plots over all input pileup files:
#   output[0] -- boxplot of per-sample coverage inside the region of interest
#   output[1] -- mean coverage per position across samples
x = []
y = []
coverageData = {}  # position -> summed coverage across samples
counts = {}        # position -> number of samples contributing at that position
new_rc_params = {'text.usetex': False,
                 "svg.fonttype": 'none'
                 }
plt.rcParams.update(new_rc_params)

for f in snakemake.input:
    # Sample id is the parent directory name of the pileup file.
    fileid = f.split('/')[-2]
    metafile = snakemake.params['metaFiles'][f]
    # Fixed: the original leaked the meta and pileup file handles.
    with open(metafile, 'r') as metaIn:
        data = metaIn.read().split('\t')
    start = int(data[0])
    end = int(data[1])
    with open(f, 'r') as pileupIn:
        pileup = pileupIn.read().splitlines()
    for l in pileup:
        data = l.split('\t')
        pos = int(data[1])
        coverage = int(data[3])
        # Only count positions inside the half-open region [start, end).
        if start <= pos < end:
            x.append(fileid)
            y.append(coverage)
            if pos not in coverageData:
                coverageData[pos] = 0
            if pos not in counts:
                counts[pos] = 0
            coverageData[pos] += coverage
            counts[pos] += 1

plt.figure(figsize=(16, 8))
sns.boxplot(x, y, order=sorted(list(set(x))))
plt.ylabel('\\footnotesize{coverage}')
plt.savefig(snakemake.output[0])

# Second plot: mean coverage per position (sum / number of samples).
x = []
y = []
for p in coverageData:
    x.append(p)
    y.append(coverageData[p] / counts[p])
plt.clf()
plt.figure(figsize=(10, 6))
plt.plot(x, y)
plt.xlabel('\\footnotesize{position}')
plt.xticks(rotation=90)
plt.ylabel('\\footnotesize{coverage}')
plt.savefig(snakemake.output[1])
import json
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import argrelextrema
from scipy.stats import poisson

# Fit a multi-peak Poisson mixture to the observed kmer-count histogram for
# each of the top candidate spa types and plot the fits.
observedKmers = {}
with open(snakemake.input['counts'], 'r') as infile:
    observedKmers = json.load(infile)

rawCounts = [c for c in observedKmers.values()]
b = np.amax(rawCounts)
hist, _ = np.histogram(rawCounts, bins=b)

# Cut off the error peak at the first local minimum of the histogram
# (choosing local min here, see DOI: 10.1093/bib/bbv029).
# NOTE(review): argrelextrema can return an empty array for a monotone
# histogram, making local_mins[0] raise IndexError -- confirm inputs always
# yield a local minimum.
local_mins = argrelextrema(hist, np.less)[0]
threshold = local_mins[0]

vals = [c for c in observedKmers.values() if c > threshold]
b = np.arange(np.amax(vals) + 1)
b_centers = np.arange(np.max(vals)) + 0.5
b_reduced = np.arange(np.max(vals))

plt.rcParams['figure.figsize'] = [10, 5]
hist, bin_edges, patches = plt.hist(vals, bins=b)

ratios = json.load(open(snakemake.input['ratios'], 'r'))

# Top three candidate spa types from the probability ranking.
with open(snakemake.input['probabilities'], 'r') as infile:
    topLines = infile.read().splitlines()[:3]
filteredSpaTypes = [topLine.split('\t')[0] for topLine in topLines]

scores = {}
maxKmerCountX = np.argmax(hist)
maxKmerCountY = np.amax(hist)
for spaType in filteredSpaTypes:
    spaTypeRatios = ratios[spaType]

    def multiPeakPoisson(x, firstPeak, peakHeight):
        # Mixture of Poisson peaks at integer multiples of firstPeak,
        # weighted by the per-type copy ratios and scaled by peakHeight.
        return sum(poisson.pmf(x, firstPeak * (i + 1)) * spaTypeRatios[i] * peakHeight
                   for i in range(len(spaTypeRatios)))

    params, cm = curve_fit(multiPeakPoisson, b_reduced, hist,
                           p0=[maxKmerCountX, maxKmerCountY],
                           bounds=([0, 0], [np.amax(vals) + 1, 1000 * maxKmerCountY]),
                           maxfev=5000)  # TODO: maxfev exc handling
    x = b_centers
    y = multiPeakPoisson(b_reduced, *params)
    # NOTE(review): this accumulates |fitted y - bin center x|, not
    # |fitted y - histogram value|; it looks like it may have been meant to
    # score the fit against `hist` -- confirm before relying on fitScore.
    fitScore = 0
    for xv, yv in zip(x, y):
        fitScore += abs(yv - xv)
    plt.plot(x, y, label='{}({}/{})'.format(spaType, fitScore, params))

plt.legend()
plt.savefig(snakemake.output[0])
import json
import math
from scipy.stats import poisson
import sys
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import MaxNLocator

logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")

# Read counts: expected kmer counts per spa type, and the observed counts.
# Fixed: use context managers instead of leaking json.load(open(...)) handles.
with open(snakemake.input['expectedCounts'], 'r') as infile:
    expected_counts = json.load(infile)
with open(snakemake.input['observedCounts'], 'r') as infile:
    actual_counts = json.load(infile)

# Predicted spa type = top entry of the probability ranking.
predictedType = '???'
with open(snakemake.input['probabilities'], 'r') as predictionFile:
    predictedType = predictionFile.read().splitlines()[0].split('\t')[0]

# Estimated per-kmer error rate.
kmer_error = -1
with open(snakemake.input['kmerError'], 'r') as errorFile:
    kmer_error = float(errorFile.read().splitlines()[0].split('\t')[1])

groundTruthType = snakemake.params['gt']
gtt_counts = expected_counts[groundTruthType]
prd_counts = expected_counts[predictedType]

# Observed counts split by whether the kmer is expected for the ground-truth
# type (GTT) respectively the predicted type (PT).
gtt_correct = [actual_counts[x] for x in actual_counts if x in gtt_counts]
gtt_error = [actual_counts[x] for x in actual_counts if x not in gtt_counts]
prd_correct = [actual_counts[x] for x in actual_counts if x in prd_counts]
prd_error = [actual_counts[x] for x in actual_counts if x not in prd_counts]


def _plot_count_histogram(axis, values, title):
    # Bar chart of value frequencies (one bar per distinct value) on `axis`.
    labels, counts = np.unique(values, return_counts=True)
    axis.bar(labels, counts, align='center')
    axis.set_title(title)


fig, axs = plt.subplots(2, 2, figsize=(8, 8), sharex='col')
_plot_count_histogram(axs[0, 0], gtt_correct, "GTT Correct Kmers")
_plot_count_histogram(axs[0, 1], gtt_error, "GTT Error Kmers")
_plot_count_histogram(axs[1, 0], prd_correct, "PT Correct Kmers")
_plot_count_histogram(axs[1, 1], prd_error, "PT Error Kmers")
plt.savefig(snakemake.output["errors"])
plt.clf()

# Deviation histograms: observed count minus the expected count (expected
# kmers) or minus the expected error level epsilon (unexpected kmers).
deviationsGttError = []
deviationsGttActual = []
deviationsGttSum = []
deviationsPrdError = []
deviationsPrdActual = []
deviationsPrdSum = []

totalObserved = sum(actual_counts[x] for x in actual_counts)
numGttErrorKmers = sum(1 for kmer in actual_counts if kmer not in gtt_counts)
numPrdErrorKmers = sum(1 for kmer in actual_counts if kmer not in prd_counts)
# Fixed: guard against ZeroDivisionError when every observed kmer is expected
# (epsilon is then never used because the corresponding error lists stay empty).
epsilonGtt = totalObserved * kmer_error / numGttErrorKmers if numGttErrorKmers else 0.0
epsilonPrd = totalObserved * kmer_error / numPrdErrorKmers if numPrdErrorKmers else 0.0

for kmer in actual_counts:
    if kmer in gtt_counts:
        deviationsGttActual.append(actual_counts[kmer] - gtt_counts[kmer])
    else:
        deviationsGttError.append(actual_counts[kmer] - epsilonGtt)
    deviationsGttSum.append(actual_counts[kmer] - epsilonGtt if kmer not in gtt_counts else actual_counts[kmer] - gtt_counts[kmer])
    if kmer in prd_counts:
        deviationsPrdActual.append(actual_counts[kmer] - prd_counts[kmer])
    else:
        deviationsPrdError.append(actual_counts[kmer] - epsilonPrd)
    deviationsPrdSum.append(actual_counts[kmer] - epsilonPrd if kmer not in prd_counts else actual_counts[kmer] - prd_counts[kmer])

fig, axs = plt.subplots(2, 3, figsize=(12, 8), sharex='col')
_plot_count_histogram(axs[0, 0], deviationsGttError, "Errors GTT, Epsilon GTT={}".format(epsilonGtt))
_plot_count_histogram(axs[0, 1], deviationsGttActual, "Actual GTT")
_plot_count_histogram(axs[0, 2], deviationsGttSum, "Sum GTT")
# Fixed: the PRD panel title previously reported the GTT epsilon value.
_plot_count_histogram(axs[1, 0], deviationsPrdError, "Errors PRD, Epsilon PRD={}".format(epsilonPrd))
_plot_count_histogram(axs[1, 1], deviationsPrdActual, "Actual PRD")
_plot_count_histogram(axs[1, 2], deviationsPrdSum, "Sum PRD")
plt.savefig(snakemake.output['deviations'])
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq


def cm(refGen, ctgAlgn, out):
    """Map contig alignments onto a reference genome.

    NOTE(review): as visible in this commit the function parses the
    alignment file but never writes anything to `out`, and the parsed
    values are discarded -- the body appears incomplete (possibly
    truncated in the diff view); confirm against the full file.
    """
    refGenome = SeqIO.read(refGen, "fasta")
    refGenomeRaw = str(refGenome.seq)
    # One '?' per reference base; meant to be filled in by mapped contigs.
    mappedContigs = '?' * len(refGenomeRaw)
    with open(ctgAlgn, 'r') as alignmentFile:
        contigMatches = alignmentFile.read().splitlines()
    for contigMatch in contigMatches:
        content = contigMatch.split('\t')
        # Column 3 of the tab-separated alignment line is the start position.
        startingPos = int(content[3])


cm(
    snakemake.input['refGen'],
    snakemake.input['ctgAlgn'],
    snakemake.output[0]
)
import os
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import sys

# Build synthetic protein A sequences by splicing each spa repeat sequence
# into a fixed frame at the insertion marker, and record where the inserted
# region lies inside every synthetic sequence.
insertionMarker = '[[[]]]'
outFile = snakemake.output['seqs']
metaFile = snakemake.output['metaInf']

frame = str(
    SeqIO.read(snakemake.input['proteinAFrame'], 'fasta').seq
)
seqs = SeqIO.parse(snakemake.input['spaSequences'], 'fasta')

# List of synthetic protein A sequences and their (id, start, end) metadata.
synthSeqs = []
metaInf = []

# Position of the insertion marker = start of the inserted region in every
# synthetic sequence (the marker is replaced, so the offset is identical).
globalLeft = frame.find(insertionMarker)
for seq in seqs:
    rawSeq = str(seq.seq)
    sequenceStr = frame.replace(insertionMarker, rawSeq)
    right = len(rawSeq)
    sequence = SeqRecord(Seq(sequenceStr, generic_dna), id=seq.id, name=seq.id + '_synth', description=seq.id + ' Synthetic Protein A')
    synthSeqs.append(sequence)
    metaInf.append([seq.id, globalLeft, globalLeft + right])

# Write output.
SeqIO.write(synthSeqs, outFile, 'fasta')
# Fixed: the original reused `outFile` as the meta file handle, shadowing the
# sequence-output path variable; use a distinct name for the handle.
with open(metaFile, 'w') as metaOut:
    for inf in metaInf:
        metaOut.write(inf[0] + '\t' + str(inf[1]) + '\t' + str(inf[2]) + '\n')
import sys
# Fixed: SeqIO is used below but was never imported in this script.
from Bio import SeqIO

# Build an artificial reference genome in which the native protein A gene is
# replaced by the synthetic protein A of the requested spa type.

# Step 1: Read the protein table.
INDEX_START_POS = snakemake.params['idxstart']
INDEX_END_POS = snakemake.params['idxend']
INDEX_STRAND = snakemake.params['idxstrand']
INDEX_PROTEIN_ID = snakemake.params['idxprotein']
proteinAIdentifier = snakemake.params['idProtA']
spaType = snakemake.params['spaType']

startPos, endPos, strand = -1, -1, -1
# Check if the protein id exists and retrieve its coordinates from the table.
with open(snakemake.input['proteintable'], 'r') as tablefile:
    lines = tablefile.read().splitlines()[1:]  # Skip first line -> contains only headers
    for line in lines:
        # Retrieve the fields that are relevant.
        entries = line.split('\t')
        startPos = int(entries[INDEX_START_POS])
        endPos = int(entries[INDEX_END_POS])
        strand = entries[INDEX_STRAND]
        name = entries[INDEX_PROTEIN_ID]
        # Fixed: the original compared against an undefined name `pid`
        # (NameError); the identifier read from params is proteinAIdentifier.
        if name == proteinAIdentifier:
            print(name, startPos, endPos, strand)
            break
    else:
        # Fixed: the error message was placed after sys.exit(-1) and was
        # therefore unreachable; report before exiting.
        print("Did not find the entry: {} in the protein table ...".format(proteinAIdentifier))
        sys.exit(-1)

# Step 2: Read the reference genome and split it around the protein A gene.
sequence = SeqIO.read(snakemake.input['referenceGenome'], 'fasta')
seqPre = sequence[:startPos]
seqPost = sequence[endPos:]

# Step 3: Read the synthetic protein A for the requested spa type.
proteinAs = SeqIO.to_dict(SeqIO.parse(snakemake.input['syntheticProteinAsSequences'], 'fasta'))
synthProteinA = proteinAs[spaType]

# Read meta information: region X coordinates inside the synthetic protein A.
regionXStart = -1
regionXEnd = -1
with open(snakemake.input['syntheticProteinAsMetaFile'], 'r') as metafile:
    lines = metafile.read().splitlines()
    lines = [line.split('\t') for line in lines]
    metaInf = {
        line[0]: (int(line[1]), int(line[2])) for line in lines
    }
    regionXStart, regionXEnd = metaInf[spaType]

# Calculate offset and get total coordinates for region X.
regionXLength = regionXEnd - regionXStart
proteinALength = len(synthProteinA)
paddingStart = proteinALength - regionXEnd
paddingEnd = regionXStart

# NOTE(review): the strand column is read above but the insert is always
# reverse-complemented regardless of `strand` -- confirm this is intended.
artificalReference = seqPre + synthProteinA.reverse_complement() + seqPost
artificalReference.id = spaType
artificalReference.description = "Artifical MRSA Genome"
genomeLength = len(artificalReference)

with open(snakemake.output['syntheticReference'], 'w') as outfile:
    SeqIO.write(artificalReference, outfile, 'fasta')
# Meta output: start and end of region X in artificial-genome coordinates.
with open(snakemake.output['syntheticReferenceMeta'], 'w') as outfile:
    outfile.write(str(paddingStart + startPos) + '\t' + str(paddingStart + startPos + regionXLength))
import json
import math
from scipy.stats import poisson
import sys
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import MaxNLocator

logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")

# Read counts.
expected_counts = json.load(open(snakemake.input['expected'], 'r'))
actual_counts = json.load(open(snakemake.input['observed'], 'r'))

# Kmers that belong to at least one spa type but NOT to the ground-truth type.
allTypeKmers = set(k for spaTypeID in expected_counts for k in expected_counts[spaTypeID])
groundTruthKmers = set(k for k in expected_counts[snakemake.params['gtt']])
validButUnexpectedKmers = allTypeKmers - groundTruthKmers

# Observed counts of those kmers: once padding absent kmers with 0,
# once restricted to kmers that were actually observed.
withZeroCounts = [actual_counts.get(x, 0) for x in validButUnexpectedKmers]
nonZeroCounts = [actual_counts[x] for x in validButUnexpectedKmers if x in actual_counts]

# Side-by-side frequency histograms of both count lists, titled with the mean.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
for axis, countList in zip(axs, (withZeroCounts, nonZeroCounts)):
    labels, counts = np.unique(countList, return_counts=True)
    axis.bar(labels, counts, align='center')
    axis.set_xticks(labels)
    axis.set_title("Mean={}".format(np.mean(countList)))
plt.savefig(snakemake.output['histogram'])
import pysam

# Count primary mapped reads per strand to assess strand bias.
samfile = pysam.AlignmentFile(snakemake.input['alignment'])
counts = 0
plus = 0
minus = 0
for read in samfile.fetch():
    # Skip unmapped reads and secondary alignments.
    if read.is_unmapped or read.is_secondary:
        continue
    counts += 1
    if read.is_reverse:
        minus += 1
    else:
        plus += 1

# Output: "<total>\t<forward>\t<reverse>".
with open(snakemake.output['strandbias'], 'w') as outfile:
    outfile.write('{}\t{}\t{}'.format(counts, plus, minus))
import json
import re
import math
import tarfile
from Bio import SeqIO
from collections import defaultdict
import gzip
import sys
import seaborn as sns, numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.signal
from itertools import chain
import logging

logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")
#########################################################################################
# Estimate the expected base-error rate of the read set from the Phred
# quality scores of both mate files.
rf1 = snakemake.input['read1']
rf2 = snakemake.input['read2']
reads_1 = SeqIO.parse(rf1, 'fastq')
reads_2 = SeqIO.parse(rf2, 'fastq')

expectedBaseErrors = 0
totalBases = 0
for read in chain(reads_1, reads_2):
    sequence = read.seq
    # Phred score Q -> per-base error probability: p = 10^(-Q/10).
    phredScores = [10 ** (-q / 10) for q in read.letter_annotations["phred_quality"]]
    totalBases += len(sequence)
    expectedBaseErrors += sum(phredScores)

# NOTE(review): raises ZeroDivisionError if both input files are empty --
# confirm upstream rules guarantee non-empty read files.
expectedBaseErrorRate = expectedBaseErrors / totalBases