diff --git a/src/Model.py b/src/Model.py
index 09cfb735e6a55a9f77b931a1fdd4848dd1d73317..47b9d95210ee53ac00853acb897604284c0a7cf1 100644
--- a/src/Model.py
+++ b/src/Model.py
@@ -6,6 +6,8 @@ import numpy as np
 import tensorflow as tf
 import os
+# Disable eager execution
+tf.compat.v1.disable_eager_execution()
 
 
 class DecoderType:
     BestPath = 0
@@ -13,11 +15,11 @@ class DecoderType:
     WordBeamSearch = 2
 
 
-class Model:
+class Model: 
     "minimalistic TF model for HTR"
 
     # model constants
-    batchSize = 50
+    batchSize = 32
     imgSize = (128, 32)
     maxTextLen = 32
 
@@ -30,10 +32,10 @@ class Model:
         self.snapID = 0
 
         # Whether to use normalization over a batch or a population
-        self.is_train = tf.placeholder(tf.bool, name='is_train')
+        self.is_train = tf.compat.v1.placeholder(tf.bool, name='is_train')
 
         # input image batch
-        self.inputImgs = tf.placeholder(tf.float32, shape=(None, Model.imgSize[0], Model.imgSize[1]))
+        self.inputImgs = tf.compat.v1.placeholder(tf.float32, shape=(None, Model.imgSize[0], Model.imgSize[1]))
 
         # setup CNN, RNN and CTC
         self.setupCNN()
@@ -42,15 +44,15 @@ class Model:
 
         # setup optimizer to train NN
         self.batchesTrained = 0
-        self.learningRate = tf.placeholder(tf.float32, shape=[])
-        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        self.learningRate = tf.compat.v1.placeholder(tf.float32, shape=[])
+        self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
         with tf.control_dependencies(self.update_ops):
-            self.optimizer = tf.train.RMSPropOptimizer(self.learningRate).minimize(self.loss)
+            self.optimizer = tf.compat.v1.train.RMSPropOptimizer(self.learningRate).minimize(self.loss)
 
         # initialize TF
         (self.sess, self.saver) = self.setupTF()
-
+
 
     def setupCNN(self):
         "create CNN layers and return output of these layers"
         cnnIn4d = tf.expand_dims(input=self.inputImgs, axis=3)
@@ -64,11 +66,11 @@ class Model:
 
         # create layers
         pool = cnnIn4d # input to first CNN layer
         for i in range(numLayers):
-            kernel = tf.Variable(tf.truncated_normal([kernelVals[i], kernelVals[i], featureVals[i], featureVals[i + 1]], stddev=0.1))
-            conv = tf.nn.conv2d(pool, kernel, padding='SAME', strides=(1,1,1,1))
-            conv_norm = tf.layers.batch_normalization(conv, training=self.is_train)
+            kernel = tf.Variable(tf.random.truncated_normal([kernelVals[i], kernelVals[i], featureVals[i], featureVals[i + 1]], stddev=0.1))
+            conv = tf.nn.conv2d(input=pool, filters=kernel, padding='SAME', strides=(1,1,1,1))
+            conv_norm = tf.compat.v1.layers.batch_normalization(conv, training=self.is_train)
             relu = tf.nn.relu(conv_norm)
-            pool = tf.nn.max_pool(relu, (1, poolVals[i][0], poolVals[i][1], 1), (1, strideVals[i][0], strideVals[i][1], 1), 'VALID')
+            pool = tf.nn.max_pool2d(input=relu, ksize=(1, poolVals[i][0], poolVals[i][1], 1), strides=(1, strideVals[i][0], strideVals[i][1], 1), padding='VALID')
 
         self.cnnOut4d = pool
@@ -79,54 +81,54 @@ class Model:
 
         # basic cells which is used to build RNN
         numHidden = 256
-        cells = [tf.contrib.rnn.LSTMCell(num_units=numHidden, state_is_tuple=True) for _ in range(2)] # 2 layers
+        cells = [tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=numHidden, state_is_tuple=True) for _ in range(2)] # 2 layers
 
         # stack basic cells
-        stacked = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
+        stacked = tf.compat.v1.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
 
         # bidirectional RNN
         # BxTxF -> BxTx2H
-        ((fw, bw), _) = tf.nn.bidirectional_dynamic_rnn(cell_fw=stacked, cell_bw=stacked, inputs=rnnIn3d, dtype=rnnIn3d.dtype)
-
+        ((fw, bw), _) = tf.compat.v1.nn.bidirectional_dynamic_rnn(cell_fw=stacked, cell_bw=stacked, inputs=rnnIn3d, dtype=rnnIn3d.dtype)
+
         # BxTxH + BxTxH -> BxTx2H -> BxTx1X2H
         concat = tf.expand_dims(tf.concat([fw, bw], 2), 2)
-
+
         # project output to chars (including blank): BxTx1x2H -> BxTx1xC -> BxTxC
-        kernel = tf.Variable(tf.truncated_normal([1, 1, numHidden * 2, len(self.charList) + 1], stddev=0.1))
+        kernel = tf.Variable(tf.random.truncated_normal([1, 1, numHidden * 2, len(self.charList) + 1], stddev=0.1))
         self.rnnOut3d = tf.squeeze(tf.nn.atrous_conv2d(value=concat, filters=kernel, rate=1, padding='SAME'), axis=[2])
-
+
     def setupCTC(self):
         "create CTC loss and decoder and return them"
         # BxTxC -> TxBxC
-        self.ctcIn3dTBC = tf.transpose(self.rnnOut3d, [1, 0, 2])
+        self.ctcIn3dTBC = tf.transpose(a=self.rnnOut3d, perm=[1, 0, 2])
 
         # ground truth text as sparse tensor
-        self.gtTexts = tf.SparseTensor(tf.placeholder(tf.int64, shape=[None, 2]) , tf.placeholder(tf.int32, [None]), tf.placeholder(tf.int64, [2]))
+        self.gtTexts = tf.SparseTensor(tf.compat.v1.placeholder(tf.int64, shape=[None, 2]) , tf.compat.v1.placeholder(tf.int32, [None]), tf.compat.v1.placeholder(tf.int64, [2]))
 
         # calc loss for batch
-        self.seqLen = tf.placeholder(tf.int32, [None])
-        self.loss = tf.reduce_mean(tf.nn.ctc_loss(labels=self.gtTexts, inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, ctc_merge_repeated=True))
+        self.seqLen = tf.compat.v1.placeholder(tf.int32, [None])
+        self.loss = tf.reduce_mean(input_tensor=tf.compat.v1.nn.ctc_loss(labels=self.gtTexts, inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, ctc_merge_repeated=True))
 
         # calc loss for each element to compute label probability
-        self.savedCtcInput = tf.placeholder(tf.float32, shape=[Model.maxTextLen, None, len(self.charList) + 1])
-        self.lossPerElement = tf.nn.ctc_loss(labels=self.gtTexts, inputs=self.savedCtcInput, sequence_length=self.seqLen, ctc_merge_repeated=True)
+        self.savedCtcInput = tf.compat.v1.placeholder(tf.float32, shape=[Model.maxTextLen, None, len(self.charList) + 1])
+        self.lossPerElement = tf.compat.v1.nn.ctc_loss(labels=self.gtTexts, inputs=self.savedCtcInput, sequence_length=self.seqLen, ctc_merge_repeated=True)
 
         # decoder: either best path decoding or beam search decoding
         if self.decoderType == DecoderType.BestPath:
             self.decoder = tf.nn.ctc_greedy_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen)
         elif self.decoderType == DecoderType.BeamSearch:
-            self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, beam_width=50, merge_repeated=False)
+            self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, beam_width=50)
         elif self.decoderType == DecoderType.WordBeamSearch:
             # import compiled word beam search operation (see https://github.com/githubharald/CTCWordBeamSearch)
             word_beam_search_module = tf.load_op_library('TFWordBeamSearch.so')
 
-            # prepare information about language (dictionary, characters in dataset, characters forming words) 
+            # prepare information about language (dictionary, characters in dataset, characters forming words)
             chars = str().join(self.charList)
             wordChars = open('../model/wordCharList.txt').read().splitlines()[0]
             corpus = open('../data/corpus.txt').read()
 
             # decode using the "Words" mode of word beam search
-            self.decoder = word_beam_search_module.word_beam_search(tf.nn.softmax(self.ctcIn3dTBC, dim=2), 50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
+            self.decoder = word_beam_search_module.word_beam_search(tf.nn.softmax(self.ctcIn3dTBC, axis=2), 50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
 
 
     def setupTF(self):
@@ -134,9 +136,9 @@ class Model:
 
         print('Python: '+sys.version)
         print('Tensorflow: '+tf.__version__)
 
-        sess=tf.Session() # TF session
+        sess=tf.compat.v1.Session() # TF session
 
-        saver = tf.train.Saver(max_to_keep=1) # saver saves model to file
+        saver = tf.compat.v1.train.Saver(max_to_keep=1) # saver saves model to file
         modelDir = '../model/'
         latestSnapshot = tf.train.latest_checkpoint(modelDir) # is there a saved model?
@@ -150,7 +152,7 @@ class Model:
             saver.restore(sess, latestSnapshot)
         else:
             print('Init with new values')
-            sess.run(tf.global_variables_initializer())
+            sess.run(tf.compat.v1.global_variables_initializer())
 
         return (sess,saver)
 
@@ -178,7 +180,7 @@ class Model:
 
     def decoderOutputToText(self, ctcOutput, batchSize):
         "extract texts from output of CTC decoder"
-
+
         # contains string of labels for each batch element
         encodedLabelStrs = [[] for i in range(batchSize)]
 
@@ -193,8 +195,8 @@ class Model:
 
         # TF decoders: label strings are contained in sparse tensor
         else:
-            # ctc returns tuple, first element is SparseTensor 
-            decoded=ctcOutput[0][0] 
+            # ctc returns tuple, first element is SparseTensor
+            decoded=ctcOutput[0][0]
 
             # go over all indices and save mapping: batch -> values
             idxDict = { b : [] for b in range(batchSize) }
@@ -241,7 +243,7 @@ class Model:
 
     def inferBatch(self, batch, calcProbability=False, probabilityOfGT=False):
         "feed a batch into the NN to recognize the texts"
-
+
         # decode, optionally save RNN output
         numBatchElements = len(batch.imgs)
         evalRnnOutput = self.dump or calcProbability
@@ -250,7 +252,7 @@ class Model:
         evalRes = self.sess.run(evalList, feedDict)
         decoded = evalRes[0]
         texts = self.decoderOutputToText(decoded, numBatchElements)
-
+
         # feed RNN output and recognized text into CTC loss to compute labeling probability
         probs = None
         if calcProbability:
@@ -258,18 +260,17 @@ class Model:
             ctcInput = evalRes[1]
             evalList = self.lossPerElement
             feedDict = {self.savedCtcInput : ctcInput, self.gtTexts : sparse, self.seqLen : [Model.maxTextLen] * numBatchElements, self.is_train: False}
-            lossVals = self.sess.run(evalList, feedDict)
-            probs = np.exp(-lossVals)
+            #lossVals = self.sess.run(evalList, feedDict)
+            #probs = np.exp(-lossVals)
 
         # dump the output of the NN to CSV file(s)
         if self.dump:
             self.dumpNNOutput(evalRes[1])
 
-        return (texts, probs)
-
+        return (texts)
+
 
     def save(self):
         "save model to file"
         self.snapID += 1
         self.saver.save(self.sess, '../model/snapshot', global_step=self.snapID)
-
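
Note: the changes above all follow the same TF1-to-TF2 migration pattern, i.e. keeping the original graph-mode code but routing it through tf.compat.v1 with eager execution disabled. The snippet below is a minimal, self-contained sketch of that pattern (not part of the patch itself); it only assumes TensorFlow 2.x and NumPy are installed, and the placeholder names are illustrative.

import numpy as np
import tensorflow as tf

# Run in graph mode, as the patched Model.py now requires.
tf.compat.v1.disable_eager_execution()

# Placeholder + variable graph, mirroring how inputImgs / learningRate are wired up.
x = tf.compat.v1.placeholder(tf.float32, shape=(None, 4), name='x')
w = tf.Variable(tf.random.truncated_normal([4, 2], stddev=0.1))
y = tf.matmul(x, w)

with tf.compat.v1.Session() as sess:
    # Variables must still be initialized explicitly in graph mode.
    sess.run(tf.compat.v1.global_variables_initializer())
    out = sess.run(y, feed_dict={x: np.zeros((3, 4), dtype=np.float32)})
    print(out.shape)  # (3, 2)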