diff --git a/.gitignore b/.gitignore index 61969afaf401bed5fa1a554a03ae522df6da4eb5..cd55ec507f6b1a268c95085d09b3171a8e49d1fd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ notes/ *.so *.pyc .idea/ -dump/ \ No newline at end of file +dump/ +word-data/ diff --git a/src/main.py b/src/main.py index 98f385d967d6e7910007b5bb468d28a31b482ba4..190d358ab6885353d7a6e9aade3b7619ea096648 100644 --- a/src/main.py +++ b/src/main.py @@ -10,8 +10,7 @@ from dataloader_iam import DataLoaderIAM, Batch from model import Model, DecoderType from preprocessor import Preprocessor -import os -os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + class FilePaths: """Filenames and paths to data.""" diff --git a/src/model.py b/src/model.py index e2bcde6b42dac2a58556a71aa3aba4aae901db76..66066468cacdf0fc332961af27bb30539f38a794 100644 --- a/src/model.py +++ b/src/model.py @@ -1,21 +1,36 @@ + +import tensorflow as tf + + +tf.config.list_physical_devices( + device_type=None +) + +physical_devices = tf.config.list_physical_devices('GPU') +print("Num GPUs:", len(physical_devices)) + import os import sys from typing import List, Tuple import numpy as np -import tensorflow as tf - from dataloader_iam import Batch +from ctc_decoder import lexicon_search, BKTree + # Disable eager mode tf.compat.v1.disable_eager_execution() + + class DecoderType: """CTC decoder types.""" BestPath = 0 BeamSearch = 1 WordBeamSearch = 2 + LexiconSearch = 3 + class Model: @@ -33,6 +48,8 @@ class Model: self.must_restore = must_restore self.snap_ID = 0 + self.corpus = [] + # Whether to use normalization over a batch or a population self.is_train = tf.compat.v1.placeholder(tf.bool, name='is_train') @@ -129,7 +146,9 @@ class Model: self.decoder = tf.nn.ctc_greedy_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len) elif self.decoder_type == DecoderType.BeamSearch: self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len, - beam_width=50) + beam_width=50, ) + + # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch) elif self.decoder_type == DecoderType.WordBeamSearch: # prepare information about language (dictionary, characters in dataset, characters forming words) @@ -144,6 +163,8 @@ class Model: # the input to the decoder must have softmax already applied self.wbs_input = tf.nn.softmax(self.ctc_in_3d_tbc, axis=2) + else: + self.wbs_input = tf.nn.softmax(self.ctc_in_3d_tbc, axis=2) def setup_tf(self) -> Tuple[tf.compat.v1.Session, tf.compat.v1.train.Saver]: """Initialize TF.""" @@ -257,7 +278,7 @@ class Model: # put tensors to be evaluated into list eval_list = [] - if self.decoder_type == DecoderType.WordBeamSearch: + if self.decoder_type >= DecoderType.WordBeamSearch: eval_list.append(self.wbs_input) else: eval_list.append(self.decoder) @@ -276,11 +297,18 @@ class Model: eval_res = self.sess.run(eval_list, feed_dict) # TF decoders: decoding already done in TF graph - if self.decoder_type != DecoderType.WordBeamSearch: + if self.decoder_type < DecoderType.WordBeamSearch: decoded = eval_res[0] # word beam search decoder: decoding is done in C++ function compute() - else: + elif self.decoder_type == DecoderType.WordBeamSearch: decoded = self.decoder.compute(eval_res[0]) + else: + bk_tree = BKTree(self.corpus) + mat = np.array(eval_res[0]) + mat = mat[:, 0, :] + + decoded = lexicon_search(mat, self.char_list, bk_tree, 50) + return decoded, 1 # map labels (numbers) to character string texts = self.decoder_output_to_text(decoded, num_batch_elements) diff --git a/src/preprocessor.py b/src/preprocessor.py index 8c956ea7c5d86c4e1597746a06ce8929a9f35fd8..407fbc4bd898b73c2b65786da74778fa991be6d5 100644 --- a/src/preprocessor.py +++ b/src/preprocessor.py @@ -140,6 +140,15 @@ class Preprocessor: else: if self.dynamic_width: ht = self.img_size[1] + #cut out first horizontal line if it is white + img = 255-img + try: + while np.sum(img[0,:]) < 4096: + img = img[1:,:] + except: + return np.ones((ht, ht)) + img=255-img + h, w = img.shape f = ht / h wt = int(f * w + self.padding) diff --git a/src/webserver.py b/src/webserver.py index d2d6dffa1709ad056135ef16d131066727fc96f1..629f56f4016389ae0c4157fdd32025b24ea9eac6 100644 --- a/src/webserver.py +++ b/src/webserver.py @@ -6,13 +6,15 @@ import dataloader_iam as htr_data_loader import preprocessor as htr_preprocessor import numpy as np from word_beam_search import WordBeamSearch +import base64 +import tensorflow as tf app = Flask(__name__) image_size = 32 -model_name = htr_model.Model(htr.char_list_from_file(), htr_model.DecoderType.WordBeamSearch, must_restore=True) - +model_name = htr_model.Model(htr.char_list_from_file(), htr_model.DecoderType.LexiconSearch, must_restore=True) +model_name.setup_ctc csv_path = '../tns.csv' char_list = htr.char_list_from_file() @@ -39,13 +41,14 @@ def predictNach(): batch = htr_data_loader.Batch([processed_image], None, 1) #change corpus for name - corpus = open('../data/Nachname.txt').read() - model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), - word_chars.encode('utf8')) + model_name.corpus = open('../data/Nachname.txt').read().split() + #model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'),word_chars.encode('utf8')) recognized, probability = htr_model.Model.infer_batch(model_name, batch) - #convert corpus to list, split at space - corpus = corpus.split() + + """ #convert corpus to list, split at space + corpus = model_name.corpus result_list=[] + print(recognized) for name in recognized: indecies = [] for i in range(len(corpus)): @@ -53,12 +56,33 @@ def predictNach(): indecies.append(i) if len(indecies) == 0: indecies.append(-1) - result_list.append((name, indecies)) - + else: + result_list.append((name, indecies)) + if len(result_list) == 0: + result_list.append((-1, -1)) """ + + + + processed_image = processed_image + 0.5 + processed_image = processed_image * 255 + #rotate image 90 degrees + processed_image = np.rot90(processed_image,3) + #mirror image + processed_image = np.fliplr(processed_image) + height, width = processed_image.shape + image = np.reshape(processed_image,(height*width)) + image = np.append(image,height) + image = np.append(image,width) + image = image.astype(np.uint64) + array_bytes = image.tobytes() + image_base64 = base64.b64encode(array_bytes).decode('utf-8') + + result = { - 'recognized': result_list, + 'recognized': recognized, + 'image': image_base64 } return jsonify(result) @@ -72,18 +96,21 @@ def predictVor(): image_array = image_array[:-2] image_array = image_array.reshape((h, w)) preprocessor = htr_preprocessor.Preprocessor(htr.get_img_size(), dynamic_width=True, padding=16) - print(image_array.shape) processed_image = preprocessor.process_img(image_array) batch = htr_data_loader.Batch([processed_image], None, 1) #change corpus for name - corpus = open('../data/Vorname.txt').read() - model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), - word_chars.encode('utf8')) + model_name.corpus = open('../data/Vorname.txt').read().split() + + + + + #model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'),word_chars.encode('utf8')) recognized, probability = htr_model.Model.infer_batch(model_name, batch) - #convert corpus to list, split at space - corpus = corpus.split() + + + """ corpus = model_name.corpus result_list=[] for name in recognized: indecies = [] @@ -92,13 +119,34 @@ def predictVor(): indecies.append(i) if len(indecies) == 0: indecies.append(-1) - result_list.append((name, indecies)) + else: + result_list.append((name, indecies)) + if len(result_list) == 0: + result_list.append(('KeinName', -1)) """ + + + + processed_image = processed_image + 0.5 + processed_image = processed_image * 255 + #rotate image -90 degrees + processed_image = np.rot90(processed_image,3) + #mirror image + processed_image = np.fliplr(processed_image) + height, width = processed_image.shape + image = np.reshape(processed_image,(height*width)) + image = np.append(image,height) + image = np.append(image,width) + image = image.astype(np.uint64) + array_bytes = image.tobytes() + image_base64 = base64.b64encode(array_bytes).decode('utf-8') + result = { - 'recognized': result_list, + 'recognized': recognized, + 'image': image_base64 } return jsonify(result)