
Commit

Merge branch 'port-to-v3'
kba committed Feb 12, 2025
2 parents 17cefc6 + 7d69279 commit 65a88b6
Showing 17 changed files with 643 additions and 608 deletions.
13 changes: 7 additions & 6 deletions .circleci/config.yml
@@ -47,9 +47,10 @@ jobs:

workflows:
version: 2
build:
jobs:
- build-python38
- build-python39
- build-python310
- build-python311
# build:
# jobs:
# # Disabled as of 2025-02-12 due to erratic behavior (CI in GH Action does work)
# # - build-python38
# # - build-python39
# # - build-python310
# # - build-python311
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
@@ -15,7 +15,8 @@ jobs:
#
# Related issue: https://github.com/actions/runner-images/issues/672.
# runs-on: ubuntu-latest
runs-on: macos-latest
# runs-on: macos-latest
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* Port the processors to use the OCR-D/core v3 API, #44
* Spawn background processes for segmentation and recognition, #44
* Refactor tests and test under various conditions (w/o METS caching, page parallel processing), #44

## [0.4.1] - 2024-05-29

Fixed:
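For context on the "OCR-D/core v3 API" entry above: in v3, a processor no longer overrides process() and iterates over self.input_files itself; it declares its executable name as a property and implements process_page_pcgts(), which receives the parsed PAGE document(s) for one page and returns an OcrdPageResult. The binarize.py diff below follows exactly this pattern; a minimal sketch (processor name hypothetical):

from typing import Optional

from ocrd import Processor
from ocrd.processor.base import OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class MyProcessor(Processor):

    @property
    def executable(self):
        return 'ocrd-my-processor'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        pcgts = input_pcgts[0]
        # modify the PAGE-XML tree in place, e.g. via pcgts.get_Page()
        return OcrdPageResult(pcgts)
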
2 changes: 1 addition & 1 deletion Makefile
@@ -71,7 +71,7 @@ docker:

# Run test
test: tests/assets
$(PYTHON) -m pytest tests $(PYTEST_ARGS)
$(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS)

#
# Assets
119 changes: 47 additions & 72 deletions ocrd_kraken/binarize.py
@@ -1,26 +1,28 @@
from __future__ import absolute_import
import os
from os.path import join
from typing import Optional

import kraken.binarization

from ocrd.processor.base import OcrdPageResult
from ocrd.processor.ocrd_page_result import OcrdPageResultImage

from ocrd import Processor
from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml
from ocrd_modelfactory import page_from_file

from ocrd_kraken.config import OCRD_TOOL


class KrakenBinarize(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-binarize']
kwargs['version'] = OCRD_TOOL['version']
super(KrakenBinarize, self).__init__(*args, **kwargs)
@property
def executable(self):
return 'ocrd-kraken-binarize'

def process(self):
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
"""Binarize the pages/regions/lines with Kraken.
Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested
Iterate over the input PAGE element hierarchy down to the requested
``level-of-operation``.
Next, for each file, crop each segment image according to the layout
@@ -36,64 +38,37 @@ def process(self):
Produce a new output file by serialising the resulting hierarchy.
"""
log = getLogger('processor.KrakenBinarize')
log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
log.debug('Input file group %s', self.input_file_grp)
log.debug('Input files %s', [str(f) for f in self.input_files])
for (n, input_file) in enumerate(self.input_files):
log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
self.add_metadata(pcgts)
assert self.workspace
assert self.output_file_grp
self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation'])

page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'page':
log.info("Binarizing page '%s'", page_id)
bin_image = kraken.binarization.nlbin(page_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
page.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=page_coords['features'] + ',binarized'))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
log.info("Binarizing region '%s'", region.id)
bin_image = kraken.binarization.nlbin(region_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
region.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=region_coords['features'] + ',binarized'))
else:
for line in region.get_TextLine():
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_filter='binarized')
log.info("Binarizing line '%s'", line.id)
bin_image = kraken.binarization.nlbin(line_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
line.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=line_coords['features'] + ',binarized'))
# update METS (add the PAGE file):
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
pcgts.set_pcGtsId(file_id)
out = self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
local_filename=file_path,
mimetype=MIMETYPE_PAGE,
content=to_xml(pcgts))
pcgts = input_pcgts[0]
assert pcgts
page = pcgts.get_Page()
assert page
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
result = OcrdPageResult(pcgts)
if self.parameter['level-of-operation'] == 'page':
self.logger.info("Binarizing page '%s'", page_id)
alternative_image = AlternativeImageType(comments=f'{page_xywh["features"]},binarized')
page.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(page_image), '.IMG-BIN', alternative_image))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
self.logger.info("Binarizing region '%s'", region.id)
alternative_image = AlternativeImageType(comments=f'{region_xywh["features"]},binarized')
region.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(region_image), f'{region.id}.IMG-BIN', alternative_image))
else:
for line in region.get_TextLine():
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh, feature_filter='binarized')
self.logger.info("Binarizing line '%s'", line.id)
alternative_image = AlternativeImageType(comments=f'{line_xywh["features"]},binarized')
line.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(line_image), f'{region.id}_{line.id}.IMG-BIN', alternative_image))
return result
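
The rewritten processor above delegates the actual binarization to kraken.binarization.nlbin(), which takes and returns a PIL image; outside of OCR-D the same call can be sketched directly (file names illustrative):

from PIL import Image
import kraken.binarization

page = Image.open('page.png')                 # input page image
bin_page = kraken.binarization.nlbin(page)    # adaptive (nlbin) binarization, returns a PIL image
bin_page.save('page.bin.png')

In the v3 processor, the result is not saved manually; it is wrapped in an OcrdPageResultImage so that the core base class, rather than the processor, writes the derived image and the updated PAGE-XML to the output file group.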
10 changes: 0 additions & 10 deletions ocrd_kraken/cli.py

This file was deleted.

76 changes: 76 additions & 0 deletions ocrd_kraken/common.py
@@ -0,0 +1,76 @@
import multiprocessing as mp

from ocrd_utils import config, initLogging

class KrakenPredictor(mp.context.SpawnProcess):
def __init__(self, logger, parameter):
self.logger = logger
self.parameter = parameter
ctxt = mp.get_context('spawn')
self.taskq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
self.resultq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
self.terminate = ctxt.Event()
ctxt = mp.get_context('fork') # base.Processor will fork workers
self.results = ctxt.Manager().dict()
super().__init__()
self.daemon = True
def __call__(self, page_id, *page_input):
self.taskq.put((page_id, page_input))
self.logger.debug("sent task for '%s'", page_id)
#return self.get(page_id)
result = self.get(page_id)
self.logger.debug("received result for '%s'", page_id)
return result
def get(self, page_id):
while not self.terminate.is_set():
if page_id in self.results:
result = self.results.pop(page_id)
if isinstance(result, Exception):
raise Exception(f"predictor failed for {page_id}") from result
return result
try:
page_id, result = self.resultq.get(timeout=0.7)
except mp.queues.Empty:
continue
self.logger.debug("storing results for '%s'", page_id)
self.results[page_id] = result
raise Exception(f"predictor terminated while waiting on results for {page_id}")
def run(self):
initLogging()
try:
self.setup()
except Exception as e:
self.logger.exception("setup failed")
self.terminate.set()
while not self.terminate.is_set():
try:
page_id, page_input = self.taskq.get(timeout=1.1)
except mp.queues.Empty:
continue
self.logger.debug("predicting '%s'", page_id)
try:
page_output = self.predict(*page_input)
except Exception as e:
self.logger.error("prediction failed: %s", e.__class__.__name__)
page_output = e
self.resultq.put((page_id, page_output))
self.logger.debug("sent result for '%s'", page_id)
self.resultq.close()
self.resultq.cancel_join_thread()
self.logger.debug("predictor terminated")
def setup(self):
raise NotImplementedError()
def predict(self, *inputs):
raise NotImplementedError()
def shutdown(self):
# do not terminate from forked processor instances
if mp.parent_process() is None:
self.terminate.set()
self.taskq.close()
self.taskq.cancel_join_thread()
self.logger.debug(f"terminated {self} in {mp.current_process().name}")
else:
self.logger.debug(f"not touching {self} in {mp.current_process().name}")
def __del__(self):
self.logger.debug(f"deinit of {self} in {mp.current_process().name}")
self.shutdown()
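
KrakenPredictor (new in this commit) is an abstract helper: setup() and predict() are left to subclasses, while __call__() enqueues one task per page and blocks until the spawned background process has placed the corresponding result into the shared dict. A rough usage sketch (DummyPredictor and its arithmetic payload are purely illustrative; only start(), __call__() and shutdown() come from the class above):

import logging

from ocrd_kraken.common import KrakenPredictor

class DummyPredictor(KrakenPredictor):

    def setup(self):
        # runs once inside the spawned process, e.g. to load models
        self.offset = 1

    def predict(self, value):
        # one task from the queue, i.e. one page's worth of work
        return value + self.offset

if __name__ == '__main__':
    predictor = DummyPredictor(logging.getLogger('dummy'), {})
    predictor.start()               # SpawnProcess.start(): launch the background worker
    print(predictor('PAGE_1', 41))  # enqueue the task, wait for the result (prints 42)
    predictor.shutdown()            # set the terminate event so the worker loop exits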
5 changes: 0 additions & 5 deletions ocrd_kraken/config.py

This file was deleted.

12 changes: 6 additions & 6 deletions ocrd_kraken/ocrd-tool.json
@@ -4,8 +4,8 @@
"tools": {
"ocrd-kraken-binarize": {
"executable": "ocrd-kraken-binarize",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-PRE-BIN"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Image preprocessing"
],
@@ -24,8 +24,8 @@
},
"ocrd-kraken-segment": {
"executable": "ocrd-kraken-segment",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-PRE-BIN"],
"output_file_grp": ["OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Layout analysis"
],
@@ -128,8 +128,8 @@
},
"ocrd-kraken-recognize": {
"executable": "ocrd-kraken-recognize",
"input_file_grp": ["OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-OCR-KRAK"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": ["Text recognition and optimization"],
"steps": ["recognition/text-recognition"],
"description": "Text recognition with Kraken",
(remaining changed files not shown)
