diff --git a/README.md b/README.md index 3d8db0a9da10ede7d8ef76f2ad3849eae5be941d..dee51b2f401e272933c081a9339de4a541d2e065 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Handwritten Text Recognition with TensorFlow -* **Update 2021: more robust model, faster dataloader, Python3 only** +* **Update 2021: more robust model, faster dataloader, word beam search decoder also available for Windows** * **Update 2020: code is compatible with TF2** @@ -41,7 +41,7 @@ If neither `--train` nor `--validate` is specified, the NN infers the text from ## Integrate word beam search decoding -It is possible to use the [word beam search decoder](https://repositum.tuwien.ac.at/obvutwoa/download/pdf/2774578) instead of the two decoders shipped with TF. +The [word beam search decoder](https://repositum.tuwien.ac.at/obvutwoa/download/pdf/2774578) can be used instead of the two decoders shipped with TF. Words are constrained to those contained in a dictionary, but arbitrary non-word character strings (numbers, punctuation marks) can still be recognized. The following illustration shows a sample for which word beam search is able to recognize the correct text, while the other decoders fail. @@ -50,12 +50,11 @@ The following illustration shows a sample for which word beam search is able to Follow these instructions to integrate word beam search decoding: 1. Clone repository [CTCWordBeamSearch](https://github.com/githubharald/CTCWordBeamSearch) -2. Compile custom TF operation (follow instructions given in README) -3. Copy binary `TFWordBeamSearch.so` from the CTCWordBeamSearch repository to the `src` directory of the SimpleHTR repository +2. Compile and install by running `pip install .` at the root level of the CTCWordBeamSearch repository +3. Specify the command line option `--decoder wordbeamsearch` when executing `main.py` to actually use the decoder -Word beam search can now be enabled by setting the corresponding command line argument. -The dictionary is created (in training and validation mode) by using all words contained in the IAM dataset (i.e. also including words from validation set) and is saved into the file `data/corpus.txt`. -Further, the (manually created) list of word-characters can be found in the file `model/wordCharList.txt`. +The dictionary is automatically created in training and validation mode by using all words contained in the IAM dataset (i.e. also including words from validation set) and is saved into the file `data/corpus.txt`. +Further, the manually created list of word-characters can be found in the file `model/wordCharList.txt`. Beam width is set to 50 to conform with the beam width of vanilla beam search decoding. @@ -83,11 +82,12 @@ The database LMDB is used to speed up image loading: * A subfolder `lmdb` is created in the IAM data directory containing the LMDB files * When training the model, add the command line option `--fast` +The dataset should be located on an SSD drive. Using the `--fast` option and a GTX 1050 Ti training takes around 3h with a batch size of 500. + ## Information about model -### Overview The model is a stripped-down version of the HTR system I implemented for [my thesis]((https://repositum.tuwien.ac.at/obvutwhs/download/pdf/2874742)). What remains is what I think is the bare minimum to recognize text with an acceptable accuracy. It consists of 5 CNN layers, 2 RNN (LSTM) layers and the CTC loss and decoding layer. @@ -103,7 +103,6 @@ The illustration below gives an overview of the NN (green: operations, pink: dat ## FAQ -* I get the error message "... TFWordBeamSearch.so: cannot open shared object file: No such file or directory": if you want to use word beam search decoding, you have to compile the custom TF operation from source * Where can I find the file `words.txt` of the IAM dataset: it is located in the subfolder `ascii` on the IAM website * I want to recognize the text contained in a text-line: the model is too small for this, you have to first segment the line into words, e.g. using the model from the [WordDetectorNN](https://github.com/githubharald/WordDetectorNN) repository * I get an error when running the script more than once from an interactive Python session: do **not** call function `main()` in file `main.py` from an interactive session, as the TF computation graph is created multiple times when calling `main()` multiple times. Run the script by executing `python main.py` instead diff --git a/src/Model.py b/src/Model.py index 6dda0dd14b24fda80537714d0e6cac559d26f796..fcacde7f75f91dadb97e800b9c0f5cf52162d4ba 100644 --- a/src/Model.py +++ b/src/Model.py @@ -118,25 +118,26 @@ class Model: self.lossPerElement = tf.compat.v1.nn.ctc_loss(labels=self.gtTexts, inputs=self.savedCtcInput, sequence_length=self.seqLen, ctc_merge_repeated=True) - # decoder: either best path decoding or beam search decoding + # best path decoding or beam search decoding if self.decoderType == DecoderType.BestPath: self.decoder = tf.nn.ctc_greedy_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen) elif self.decoderType == DecoderType.BeamSearch: self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctcIn3dTBC, sequence_length=self.seqLen, beam_width=50) + # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch) elif self.decoderType == DecoderType.WordBeamSearch: - # import compiled word beam search operation (see https://github.com/githubharald/CTCWordBeamSearch) - word_beam_search_module = tf.load_op_library('TFWordBeamSearch.so') - # prepare information about language (dictionary, characters in dataset, characters forming words) chars = str().join(self.charList) wordChars = open('../model/wordCharList.txt').read().splitlines()[0] corpus = open('../data/corpus.txt').read() # decode using the "Words" mode of word beam search - self.decoder = word_beam_search_module.word_beam_search(tf.nn.softmax(self.ctcIn3dTBC, axis=2), 50, 'Words', - 0.0, corpus.encode('utf8'), chars.encode('utf8'), - wordChars.encode('utf8')) + from word_beam_search import WordBeamSearch + self.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), + wordChars.encode('utf8')) + + # the input to the decoder must have softmax already applied + self.wbsInput = tf.nn.softmax(self.ctcIn3dTBC, axis=2) def setupTF(self): "initialize TF" @@ -186,39 +187,34 @@ class Model: def decoderOutputToText(self, ctcOutput, batchSize): "extract texts from output of CTC decoder" - # contains string of labels for each batch element - encodedLabelStrs = [[] for i in range(batchSize)] - - # word beam search: label strings terminated by blank + # word beam search: already contains label strings if self.decoderType == DecoderType.WordBeamSearch: - blank = len(self.charList) - for b in range(batchSize): - for label in ctcOutput[b]: - if label == blank: - break - encodedLabelStrs[b].append(label) + labelStrs = ctcOutput # TF decoders: label strings are contained in sparse tensor else: # ctc returns tuple, first element is SparseTensor decoded = ctcOutput[0][0] + # contains string of labels for each batch element + labelStrs = [[] for _ in range(batchSize)] + # go over all indices and save mapping: batch -> values - idxDict = {b: [] for b in range(batchSize)} for (idx, idx2d) in enumerate(decoded.indices): label = decoded.values[idx] batchElement = idx2d[0] # index according to [b,t] - encodedLabelStrs[batchElement].append(label) + labelStrs[batchElement].append(label) # map labels to chars for all batch elements - return [str().join([self.charList[c] for c in labelStr]) for labelStr in encodedLabelStrs] + return [str().join([self.charList[c] for c in labelStr]) for labelStr in labelStrs] def trainBatch(self, batch): "feed a batch into the NN to train it" numBatchElements = len(batch.imgs) sparse = self.toSparse(batch.gtTexts) evalList = [self.optimizer, self.loss] - feedDict = {self.inputImgs: batch.imgs, self.gtTexts: sparse, self.seqLen: [Model.maxTextLen] * numBatchElements, self.is_train: True} + feedDict = {self.inputImgs: batch.imgs, self.gtTexts: sparse, + self.seqLen: [Model.maxTextLen] * numBatchElements, self.is_train: True} _, lossVal = self.sess.run(evalList, feedDict) self.batchesTrained += 1 return lossVal @@ -247,12 +243,33 @@ class Model: # decode, optionally save RNN output numBatchElements = len(batch.imgs) - evalRnnOutput = self.dump or calcProbability - evalList = [self.decoder] + ([self.ctcIn3dTBC] if evalRnnOutput else []) + + # put tensors to be evaluated into list + evalList = [] + + if self.decoderType == DecoderType.WordBeamSearch: + evalList.append(self.wbsInput) + else: + evalList.append(self.decoder) + + if self.dump or calcProbability: + evalList.append(self.ctcIn3dTBC) + + # dict containing all tensor fed into the model feedDict = {self.inputImgs: batch.imgs, self.seqLen: [Model.maxTextLen] * numBatchElements, self.is_train: False} + + # evaluate model evalRes = self.sess.run(evalList, feedDict) - decoded = evalRes[0] + + # TF decoders: decoding already done in TF graph + if self.decoderType != DecoderType.WordBeamSearch: + decoded = evalRes[0] + # word beam search decoder: decoding is done in C++ function compute() + else: + decoded = self.decoder.compute(evalRes[0]) + + # map labels (numbers) to character string texts = self.decoderOutputToText(decoded, numBatchElements) # feed RNN output and recognized text into CTC loss to compute labeling probability @@ -270,7 +287,7 @@ class Model: if self.dump: self.dumpNNOutput(evalRes[1]) - return (texts, probs) + return texts, probs def save(self): "save model to file"