Commit 2e6282f0 authored by fabian

Added lexicon search for name recognition

parent abf00eb0
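The gist of the change: instead of constraining decoding with WordBeamSearch, the server now lets the model decode freely and snaps the raw CTC output onto the closest word of a name corpus via a BK-tree. A minimal sketch of that decoding step, assuming the API of the ctc_decoder package (https://github.com/githubharald/CTCDecoder) that model.py imports below; the character set, corpus, and matrix here are made up:

# Minimal lexicon-search sketch; chars, corpus and mat are illustrative.
import numpy as np
from ctc_decoder import BKTree, lexicon_search

chars = 'ab'                  # recognizable characters; the CTC blank is the last matrix column
corpus = ['aa', 'ab', 'ba']   # hypothetical lexicon of valid words
bk_tree = BKTree(corpus)      # BK-tree allows fast edit-distance lookups in the lexicon

# mat: T x C matrix of per-timestep character probabilities (softmax already applied)
mat = np.array([[0.7, 0.2, 0.1],
                [0.1, 0.2, 0.7],
                [0.2, 0.7, 0.1]])

# best-path decode, then pick the closest corpus word within edit distance 1
print(lexicon_search(mat, chars, bk_tree, 1))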
.gitignore
@@ -6,3 +6,4 @@ notes/
 *.pyc
 .idea/
 dump/
+word-data/
main.py
@@ -10,8 +10,7 @@ from dataloader_iam import DataLoaderIAM, Batch
 from model import Model, DecoderType
 from preprocessor import Preprocessor
-import os
-os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


 class FilePaths:
     """Filenames and paths to data."""
model.py
+import tensorflow as tf
+physical_devices = tf.config.list_physical_devices('GPU')
+print("Num GPUs:", len(physical_devices))
 import os
 import sys
 from typing import List, Tuple

 import numpy as np
-import tensorflow as tf

 from dataloader_iam import Batch
+from ctc_decoder import lexicon_search, BKTree

 # Disable eager mode
 tf.compat.v1.disable_eager_execution()


 class DecoderType:
     """CTC decoder types."""
     BestPath = 0
     BeamSearch = 1
     WordBeamSearch = 2
+    LexiconSearch = 3


 class Model:
@@ -33,6 +48,8 @@ class Model:
         self.must_restore = must_restore
         self.snap_ID = 0
+        self.corpus = []

         # Whether to use normalization over a batch or a population
         self.is_train = tf.compat.v1.placeholder(tf.bool, name='is_train')
@@ -129,7 +146,9 @@ class Model:
             self.decoder = tf.nn.ctc_greedy_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len)
         elif self.decoder_type == DecoderType.BeamSearch:
             self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len,
                                                          beam_width=50)

         # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch)
         elif self.decoder_type == DecoderType.WordBeamSearch:
             # prepare information about language (dictionary, characters in dataset, characters forming words)
@@ -144,6 +163,8 @@ class Model:
             # the input to the decoder must have softmax already applied
             self.wbs_input = tf.nn.softmax(self.ctc_in_3d_tbc, axis=2)
+        else:
+            # lexicon search also consumes the softmax output; decoding happens in Python, outside the graph
+            self.wbs_input = tf.nn.softmax(self.ctc_in_3d_tbc, axis=2)

     def setup_tf(self) -> Tuple[tf.compat.v1.Session, tf.compat.v1.train.Saver]:
         """Initialize TF."""
@@ -257,7 +278,7 @@ class Model:
         # put tensors to be evaluated into list
         eval_list = []
-        if self.decoder_type == DecoderType.WordBeamSearch:
+        if self.decoder_type >= DecoderType.WordBeamSearch:
             eval_list.append(self.wbs_input)
         else:
             eval_list.append(self.decoder)
@@ -276,11 +297,18 @@ class Model:
         eval_res = self.sess.run(eval_list, feed_dict)

         # TF decoders: decoding already done in TF graph
-        if self.decoder_type != DecoderType.WordBeamSearch:
+        if self.decoder_type < DecoderType.WordBeamSearch:
             decoded = eval_res[0]
         # word beam search decoder: decoding is done in C++ function compute()
-        else:
+        elif self.decoder_type == DecoderType.WordBeamSearch:
             decoded = self.decoder.compute(eval_res[0])
+        # lexicon search: decode in Python with a BK-tree built over the corpus words
+        else:
+            bk_tree = BKTree(self.corpus)
+            mat = np.array(eval_res[0])
+            mat = mat[:, 0, :]  # only the first batch element is decoded
+            decoded = lexicon_search(mat, self.char_list, bk_tree, 50)
+            return decoded, 1  # lexicon_search returns the text itself; probability is not computed

         # map labels (numbers) to character string
         texts = self.decoder_output_to_text(decoded, num_batch_elements)
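Note that the new lexicon branch decodes only the first batch element (mat[:, 0, :]) and reports a constant probability of 1, while the other branches decode the whole batch. A sketch of a batch-aware variant under the same assumptions as the diff (lexicon_decode_batch is a hypothetical helper, not part of the commit):

# Hypothetical batch-aware variant of the lexicon branch above.
import numpy as np
from ctc_decoder import BKTree, lexicon_search

def lexicon_decode_batch(eval_output, char_list, corpus, tolerance=50):
    bk_tree = BKTree(corpus)
    mat = np.array(eval_output)  # shape (T, B, C): time x batch x characters(+blank)
    # run the lexicon search once per batch element instead of only element 0
    return [lexicon_search(mat[:, b, :], char_list, bk_tree, tolerance)
            for b in range(mat.shape[1])]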
preprocessor.py
@@ -140,6 +140,15 @@ class Preprocessor:
         else:
             if self.dynamic_width:
                 ht = self.img_size[1]
+                # trim leading rows that are almost entirely white
+                img = 255 - img  # invert: white rows now sum to nearly 0
+                try:
+                    while np.sum(img[0, :]) < 4096:
+                        img = img[1:, :]
+                except IndexError:
+                    # every row was white: fall back to a blank square image
+                    return np.ones((ht, ht))
+                img = 255 - img  # undo the inversion
                 h, w = img.shape
                 f = ht / h
                 wt = int(f * w + self.padding)
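The row trimming added above is easier to reason about (and test) in isolation. A sketch as a standalone helper under the diff's assumptions (grayscale uint8 image with white background; the helper name is hypothetical, the 4096 threshold is taken from the commit):

import numpy as np

def trim_leading_white_rows(img, threshold=4096):
    # invert like the diff does, so white rows sum to nearly 0
    inv = 255 - img.astype(np.int32)
    row = 0
    while row < inv.shape[0] and np.sum(inv[row, :]) < threshold:
        row += 1
    # None signals an all-white image (the diff returns np.ones((ht, ht)) there)
    return img[row:, :] if row < inv.shape[0] else None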
Flask server (file path not preserved in this extract)
@@ -6,13 +6,15 @@ import dataloader_iam as htr_data_loader
 import preprocessor as htr_preprocessor
 import numpy as np
 from word_beam_search import WordBeamSearch
+import base64
+import tensorflow as tf

 app = Flask(__name__)
 image_size = 32
-model_name = htr_model.Model(htr.char_list_from_file(), htr_model.DecoderType.WordBeamSearch, must_restore=True)
+model_name = htr_model.Model(htr.char_list_from_file(), htr_model.DecoderType.LexiconSearch, must_restore=True)
+model_name.setup_ctc()  # note: Model.__init__ already runs the CTC setup, so this call is likely redundant

 csv_path = '../tns.csv'
 char_list = htr.char_list_from_file()
@@ -39,13 +41,14 @@ def predictNach():
     batch = htr_data_loader.Batch([processed_image], None, 1)

     # change corpus to the surname (Nachname) list
-    corpus = open('../data/Nachname.txt').read()
-    model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'),
-                                        word_chars.encode('utf8'))
+    model_name.corpus = open('../data/Nachname.txt').read().split()
+    #model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), word_chars.encode('utf8'))

     recognized, probability = htr_model.Model.infer_batch(model_name, batch)

-    #convert corpus to list, split at space
-    corpus = corpus.split()
+    """ #convert corpus to list, split at space
+    corpus = model_name.corpus
     result_list=[]
+    print(recognized)
     for name in recognized:
         indecies = []
         for i in range(len(corpus)):
@@ -53,12 +56,33 @@ def predictNach():
             indecies.append(i)
         if len(indecies) == 0:
             indecies.append(-1)
+        else:
             result_list.append((name, indecies))
-    if len(result_list) == 0:
-        result_list.append((-1, -1))
+    """
+    processed_image = processed_image + 0.5
+    processed_image = processed_image * 255
+    # rotate the image 90 degrees clockwise (three 90-degree CCW rotations)
+    processed_image = np.rot90(processed_image, 3)
+    # mirror the image horizontally
+    processed_image = np.fliplr(processed_image)
+    height, width = processed_image.shape
+    image = np.reshape(processed_image, (height * width))
+    # append height and width so the client can restore the shape
+    image = np.append(image, height)
+    image = np.append(image, width)
+    image = image.astype(np.uint64)
+    array_bytes = image.tobytes()
+    image_base64 = base64.b64encode(array_bytes).decode('utf-8')
     result = {
-        'recognized': result_list,
+        'recognized': recognized,
+        'image': image_base64
     }
     return jsonify(result)
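The JSON response now carries the preprocessed image as base64-encoded uint64 values, with height and width appended as the last two entries, the same layout predictVor() already expects on its input side. A client-side decoding sketch matching that layout (decode_image_field is a hypothetical helper):

import base64
import numpy as np

def decode_image_field(image_base64):
    # base64 -> flat uint64 array; the last two values are height and width
    raw = np.frombuffer(base64.b64decode(image_base64), dtype=np.uint64)
    height, width = int(raw[-2]), int(raw[-1])
    return raw[:-2].reshape((height, width)).astype(np.uint8)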
@@ -72,18 +96,21 @@ def predictVor():
     image_array = image_array[:-2]
     image_array = image_array.reshape((h, w))
     preprocessor = htr_preprocessor.Preprocessor(htr.get_img_size(), dynamic_width=True, padding=16)
+    print(image_array.shape)
     processed_image = preprocessor.process_img(image_array)
     batch = htr_data_loader.Batch([processed_image], None, 1)

     # change corpus to the first-name (Vorname) list
-    corpus = open('../data/Vorname.txt').read()
-    model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'),
-                                        word_chars.encode('utf8'))
+    model_name.corpus = open('../data/Vorname.txt').read().split()
+    #model_name.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), word_chars.encode('utf8'))

     recognized, probability = htr_model.Model.infer_batch(model_name, batch)

-    #convert corpus to list, split at space
-    corpus = corpus.split()
+    """ corpus = model_name.corpus
     result_list=[]
     for name in recognized:
         indecies = []
@@ -92,13 +119,34 @@ def predictVor():
             indecies.append(i)
         if len(indecies) == 0:
             indecies.append(-1)
+        else:
             result_list.append((name, indecies))
-    if len(result_list) == 0:
-        result_list.append(('KeinName', -1))
+    """
+    processed_image = processed_image + 0.5
+    processed_image = processed_image * 255
+    # rotate the image 90 degrees clockwise (the original comment says -90 degrees)
+    processed_image = np.rot90(processed_image, 3)
+    # mirror the image horizontally
+    processed_image = np.fliplr(processed_image)
+    height, width = processed_image.shape
+    image = np.reshape(processed_image, (height * width))
+    # append height and width so the client can restore the shape
+    image = np.append(image, height)
+    image = np.append(image, width)
+    image = image.astype(np.uint64)
+    array_bytes = image.tobytes()
+    image_base64 = base64.b64encode(array_bytes).decode('utf-8')
     result = {
-        'recognized': result_list,
+        'recognized': recognized,
+        'image': image_base64
     }
     return jsonify(result)
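predictVor() repeats the rotate/mirror/encode block from predictNach() verbatim; a sketch of a shared helper both routes could call (encode_processed_image is a hypothetical name, the steps are lifted from the diff):

import base64
import numpy as np

def encode_processed_image(processed_image):
    img = (processed_image + 0.5) * 255       # undo the model's input normalization
    img = np.fliplr(np.rot90(img, 3))         # rotate 90 degrees clockwise, then mirror
    height, width = img.shape
    flat = np.append(img.reshape(height * width), [height, width]).astype(np.uint64)
    return base64.b64encode(flat.tobytes()).decode('utf-8')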