
Commit

Merge branch 'port-to-v3'
kba committed Feb 12, 2025
2 parents 17cefc6 + 7d69279 commit 65a88b6
Showing 17 changed files with 643 additions and 608 deletions.
13 changes: 7 additions & 6 deletions .circleci/config.yml
@@ -47,9 +47,10 @@ jobs:

workflows:
version: 2
build:
jobs:
- build-python38
- build-python39
- build-python310
- build-python311
# build:
# jobs:
# # Disabled as of 2025-02-12 due to erratic behavior (CI in GH Action does work)
# # - build-python38
# # - build-python39
# # - build-python310
# # - build-python311
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
@@ -15,7 +15,8 @@ jobs:
#
# Related issue: https://github.com/actions/runner-images/issues/672.
# runs-on: ubuntu-latest
runs-on: macos-latest
# runs-on: macos-latest
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* Port the processors to use the OCR-D/core v3 API, #44
* Spawn background processes for segmentation and recognition, #44
* Refactor tests and test under various conditions (w/o METS caching, page parallel processing), #44

## [0.4.1] - 2024-05-29

Fixed:
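For context on the "OCR-D/core v3 API" entry above: in v3, a processor no longer overrides process() and iterates over self.input_files itself; it declares its executable name as a property and implements process_page_pcgts(), which receives the parsed PAGE document(s) for one page and returns an OcrdPageResult. The binarize.py diff below follows exactly this pattern; a minimal sketch (processor name hypothetical):

from typing import Optional

from ocrd import Processor
from ocrd.processor.base import OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class MyProcessor(Processor):

    @property
    def executable(self):
        return 'ocrd-my-processor'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        pcgts = input_pcgts[0]
        # modify the PAGE-XML tree in place, e.g. via pcgts.get_Page()
        return OcrdPageResult(pcgts)
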
2 changes: 1 addition & 1 deletion Makefile
@@ -71,7 +71,7 @@ docker:

# Run test
test: tests/assets
$(PYTHON) -m pytest tests $(PYTEST_ARGS)
$(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS)

#
# Assets
119 changes: 47 additions & 72 deletions ocrd_kraken/binarize.py
@@ -1,26 +1,28 @@
from __future__ import absolute_import
import os
from os.path import join
from typing import Optional

import kraken.binarization

from ocrd.processor.base import OcrdPageResult
from ocrd.processor.ocrd_page_result import OcrdPageResultImage

from ocrd import Processor
from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE
from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml
from ocrd_modelfactory import page_from_file

from ocrd_kraken.config import OCRD_TOOL


class KrakenBinarize(Processor):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-binarize']
kwargs['version'] = OCRD_TOOL['version']
super(KrakenBinarize, self).__init__(*args, **kwargs)
@property
def executable(self):
return 'ocrd-kraken-binarize'

def process(self):
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
"""Binarize the pages/regions/lines with Kraken.
Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested
Iterate over the input PAGE element hierarchy down to the requested
``level-of-operation``.
Next, for each file, crop each segment image according to the layout
@@ -36,64 +38,37 @@ def process(self):
Produce a new output file by serialising the resulting hierarchy.
"""
log = getLogger('processor.KrakenBinarize')
log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
log.debug('Input file group %s', self.input_file_grp)
log.debug('Input files %s', [str(f) for f in self.input_files])
for (n, input_file) in enumerate(self.input_files):
log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
self.add_metadata(pcgts)
assert self.workspace
assert self.output_file_grp
self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation'])

page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'page':
log.info("Binarizing page '%s'", page_id)
bin_image = kraken.binarization.nlbin(page_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
page.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=page_coords['features'] + ',binarized'))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
log.info("Binarizing region '%s'", region.id)
bin_image = kraken.binarization.nlbin(region_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
region.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=region_coords['features'] + ',binarized'))
else:
for line in region.get_TextLine():
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords, feature_filter='binarized')
log.info("Binarizing line '%s'", line.id)
bin_image = kraken.binarization.nlbin(line_image)
file_path = self.workspace.save_image_file(
bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
self.output_file_grp,
page_id=input_file.pageId)
line.add_AlternativeImage(AlternativeImageType(
filename=file_path,
comments=line_coords['features'] + ',binarized'))
# update METS (add the PAGE file):
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
pcgts.set_pcGtsId(file_id)
out = self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
local_filename=file_path,
mimetype=MIMETYPE_PAGE,
content=to_xml(pcgts))
pcgts = input_pcgts[0]
assert pcgts
page = pcgts.get_Page()
assert page
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id, feature_filter='binarized')
result = OcrdPageResult(pcgts)
if self.parameter['level-of-operation'] == 'page':
self.logger.info("Binarizing page '%s'", page_id)
alternative_image = AlternativeImageType(comments=f'{page_xywh["features"]},binarized')
page.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(page_image), '.IMG-BIN', alternative_image))
else:
for region in page.get_AllRegions(classes=['Text']):
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh, feature_filter='binarized')
if self.parameter['level-of-operation'] == 'region':
self.logger.info("Binarizing region '%s'", region.id)
alternative_image = AlternativeImageType(comments=f'{region_xywh["features"]},binarized')
region.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(region_image), f'{region.id}.IMG-BIN', alternative_image))
else:
for line in region.get_TextLine():
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh, feature_filter='binarized')
self.logger.info("Binarizing line '%s'", line.id)
alternative_image = AlternativeImageType(comments=f'{line_xywh["features"]},binarized')
line.add_AlternativeImage(alternative_image)
result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(line_image), f'{region.id}_{line.id}.IMG-BIN', alternative_image))
return result
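
The rewritten processor above delegates the actual binarization to kraken.binarization.nlbin(), which takes and returns a PIL image; outside of OCR-D the same call can be sketched directly (file names illustrative):

from PIL import Image
import kraken.binarization

page = Image.open('page.png')                 # input page image
bin_page = kraken.binarization.nlbin(page)    # adaptive (nlbin) binarization, returns a PIL image
bin_page.save('page.bin.png')

In the v3 processor, the result is not saved manually; it is wrapped in an OcrdPageResultImage so that the core base class, rather than the processor, writes the derived image and the updated PAGE-XML to the output file group.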
10 changes: 0 additions & 10 deletions ocrd_kraken/cli.py

This file was deleted.

76 changes: 76 additions & 0 deletions ocrd_kraken/common.py
@@ -0,0 +1,76 @@
import multiprocessing as mp

from ocrd_utils import config, initLogging

class KrakenPredictor(mp.context.SpawnProcess):
def __init__(self, logger, parameter):
self.logger = logger
self.parameter = parameter
ctxt = mp.get_context('spawn')
self.taskq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
self.resultq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
self.terminate = ctxt.Event()
ctxt = mp.get_context('fork') # base.Processor will fork workers
self.results = ctxt.Manager().dict()
super().__init__()
self.daemon = True
def __call__(self, page_id, *page_input):
self.taskq.put((page_id, page_input))
self.logger.debug("sent task for '%s'", page_id)
#return self.get(page_id)
result = self.get(page_id)
self.logger.debug("received result for '%s'", page_id)
return result
def get(self, page_id):
while not self.terminate.is_set():
if page_id in self.results:
result = self.results.pop(page_id)
if isinstance(result, Exception):
raise Exception(f"predictor failed for {page_id}") from result
return result
try:
page_id, result = self.resultq.get(timeout=0.7)
except mp.queues.Empty:
continue
self.logger.debug("storing results for '%s'", page_id)
self.results[page_id] = result
raise Exception(f"predictor terminated while waiting on results for {page_id}")
def run(self):
initLogging()
try:
self.setup()
except Exception as e:
self.logger.exception("setup failed")
self.terminate.set()
while not self.terminate.is_set():
try:
page_id, page_input = self.taskq.get(timeout=1.1)
except mp.queues.Empty:
continue
self.logger.debug("predicting '%s'", page_id)
try:
page_output = self.predict(*page_input)
except Exception as e:
self.logger.error("prediction failed: %s", e.__class__.__name__)
page_output = e
self.resultq.put((page_id, page_output))
self.logger.debug("sent result for '%s'", page_id)
self.resultq.close()
self.resultq.cancel_join_thread()
self.logger.debug("predictor terminated")
def setup(self):
raise NotImplementedError()
def predict(self, *inputs):
raise NotImplementedError()
def shutdown(self):
# do not terminate from forked processor instances
if mp.parent_process() is None:
self.terminate.set()
self.taskq.close()
self.taskq.cancel_join_thread()
self.logger.debug(f"terminated {self} in {mp.current_process().name}")
else:
self.logger.debug(f"not touching {self} in {mp.current_process().name}")
def __del__(self):
self.logger.debug(f"deinit of {self} in {mp.current_process().name}")
self.shutdown()
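
KrakenPredictor (new in this commit) is an abstract helper: setup() and predict() are left to subclasses, while __call__() enqueues one task per page and blocks until the spawned background process has placed the corresponding result into the shared dict. A rough usage sketch (DummyPredictor and its arithmetic payload are purely illustrative; only start(), __call__() and shutdown() come from the class above):

import logging

from ocrd_kraken.common import KrakenPredictor

class DummyPredictor(KrakenPredictor):

    def setup(self):
        # runs once inside the spawned process, e.g. to load models
        self.offset = 1

    def predict(self, value):
        # one task from the queue, i.e. one page's worth of work
        return value + self.offset

if __name__ == '__main__':
    predictor = DummyPredictor(logging.getLogger('dummy'), {})
    predictor.start()               # SpawnProcess.start(): launch the background worker
    print(predictor('PAGE_1', 41))  # enqueue the task, wait for the result (prints 42)
    predictor.shutdown()            # set the terminate event so the worker loop exits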
5 changes: 0 additions & 5 deletions ocrd_kraken/config.py

This file was deleted.

12 changes: 6 additions & 6 deletions ocrd_kraken/ocrd-tool.json
@@ -4,8 +4,8 @@
"tools": {
"ocrd-kraken-binarize": {
"executable": "ocrd-kraken-binarize",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-PRE-BIN"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Image preprocessing"
],
@@ -24,8 +24,8 @@
},
"ocrd-kraken-segment": {
"executable": "ocrd-kraken-segment",
"input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-PRE-BIN"],
"output_file_grp": ["OCR-D-SEG-REGION", "OCR-D-SEG-LINE"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": [
"Layout analysis"
],
@@ -128,8 +128,8 @@
},
"ocrd-kraken-recognize": {
"executable": "ocrd-kraken-recognize",
"input_file_grp": ["OCR-D-SEG-LINE"],
"output_file_grp": ["OCR-D-OCR-KRAK"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"categories": ["Text recognition and optimization"],
"steps": ["recognition/text-recognition"],
"description": "Text recognition with Kraken",
(remaining changed files not shown)
