diff --git a/.circleci/config.yml b/.circleci/config.yml index 2be7f23..bf0f83b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -47,9 +47,10 @@ jobs: workflows: version: 2 - build: - jobs: - - build-python38 - - build-python39 - - build-python310 - - build-python311 + # build: + # jobs: + # # Disabled as of 2025-02-12 due to erratic behavior (CI in GH Action does work) + # # - build-python38 + # # - build-python39 + # # - build-python310 + # # - build-python311 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2c420e1..ed91176 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,8 @@ jobs: # # Related issue: https://github.com/actions/runner-images/issues/672. # runs-on: ubuntu-latest - runs-on: macos-latest + # runs-on: macos-latest + runs-on: ubuntu-latest strategy: fail-fast: false matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index c0f6837..7962c27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + + * Port the processors to use the OCR-D/core v3 API, #44 + * Spawn background processes for segmentation and recognition, #44 + * Refactor tests and test under various conditions (w/o METS caching, page parallel processing), #44 + ## [0.4.1] - 2024-05-29 Fixed: diff --git a/Makefile b/Makefile index 42a8f3d..c682693 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,7 @@ docker: # Run test test: tests/assets - $(PYTHON) -m pytest tests $(PYTEST_ARGS) + $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) # # Assets diff --git a/ocrd_kraken/binarize.py b/ocrd_kraken/binarize.py index 7c53940..8d639b8 100644 --- a/ocrd_kraken/binarize.py +++ b/ocrd_kraken/binarize.py @@ -1,26 +1,28 @@ from __future__ import absolute_import -import os +from os.path import join +from typing import Optional + import kraken.binarization + +from ocrd.processor.base import OcrdPageResult +from ocrd.processor.ocrd_page_result import OcrdPageResultImage + from ocrd import Processor -from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE -from ocrd_models.ocrd_page import AlternativeImageType, to_xml +from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id, MIMETYPE_PAGE +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd_modelfactory import page_from_file -from ocrd_kraken.config import OCRD_TOOL - class KrakenBinarize(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-binarize'] - kwargs['version'] = OCRD_TOOL['version'] - super(KrakenBinarize, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-kraken-binarize' - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize the pages/regions/lines with Kraken. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + Iterate over the input PAGE element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -36,64 +38,37 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - log = getLogger('processor.KrakenBinarize') - log.debug('Level of operation: "%s"', self.parameter['level-of-operation']) - log.debug('Input file group %s', self.input_file_grp) - log.debug('Input files %s', [str(f) for f in self.input_files]) - for (n, input_file) in enumerate(self.input_files): - log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - self.add_metadata(pcgts) + assert self.workspace + assert self.output_file_grp + self.logger.debug('Level of operation: "%s"', self.parameter['level-of-operation']) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['level-of-operation'] == 'page': - log.info("Binarizing page '%s'", page_id) - bin_image = kraken.binarization.nlbin(page_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=page_coords['features'] + ',binarized')) - else: - for region in page.get_AllRegions(classes=['Text']): - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_filter='binarized') - if self.parameter['level-of-operation'] == 'region': - log.info("Binarizing region '%s'", region.id) - bin_image = kraken.binarization.nlbin(region_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '_' + region.id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=region_coords['features'] + ',binarized')) - else: - for line in region.get_TextLine(): - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords, feature_filter='binarized') - log.info("Binarizing line '%s'", line.id) - bin_image = kraken.binarization.nlbin(line_image) - file_path = self.workspace.save_image_file( - bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN', - self.output_file_grp, - page_id=input_file.pageId) - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_coords['features'] + ',binarized')) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) + pcgts = input_pcgts[0] + assert pcgts + page = pcgts.get_Page() + assert page + page_image, page_xywh, _ = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') + result = OcrdPageResult(pcgts) + if self.parameter['level-of-operation'] == 'page': + self.logger.info("Binarizing page '%s'", page_id) + alternative_image = AlternativeImageType(comments=f'{page_xywh["features"]},binarized') + page.add_AlternativeImage(alternative_image) + result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(page_image), '.IMG-BIN', alternative_image)) + else: + for region in page.get_AllRegions(classes=['Text']): + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, 
page_xywh, feature_filter='binarized')
+            if self.parameter['level-of-operation'] == 'region':
+                self.logger.info("Binarizing region '%s'", region.id)
+                alternative_image = AlternativeImageType(comments=f'{region_xywh["features"]},binarized')
+                region.add_AlternativeImage(alternative_image)
+                result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(region_image), f'{region.id}.IMG-BIN', alternative_image))
+            else:
+                for line in region.get_TextLine():
+                    line_image, line_xywh = self.workspace.image_from_segment(
+                        line, region_image, region_xywh, feature_filter='binarized')
+                    self.logger.info("Binarizing line '%s'", line.id)
+                    alternative_image = AlternativeImageType(comments=f'{line_xywh["features"]},binarized')
+                    line.add_AlternativeImage(alternative_image)
+                    result.images.append(OcrdPageResultImage(kraken.binarization.nlbin(line_image), f'{region.id}_{line.id}.IMG-BIN', alternative_image))
+        return result
diff --git a/ocrd_kraken/cli.py b/ocrd_kraken/cli.py
deleted file mode 100644
index ead681b..0000000
--- a/ocrd_kraken/cli.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import click
-
-from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_kraken.binarize import KrakenBinarize
-
-@click.command()
-@ocrd_cli_options
-def ocrd_kraken_binarize(*args, **kwargs):
-    return ocrd_cli_wrap_processor(KrakenBinarize, *args, **kwargs)
-
diff --git a/ocrd_kraken/common.py b/ocrd_kraken/common.py
new file mode 100644
index 0000000..b24d0ac
--- /dev/null
+++ b/ocrd_kraken/common.py
@@ -0,0 +1,76 @@
+import multiprocessing as mp
+
+from ocrd_utils import config, initLogging
+
+class KrakenPredictor(mp.context.SpawnProcess):
+    def __init__(self, logger, parameter):
+        self.logger = logger
+        self.parameter = parameter
+        ctxt = mp.get_context('spawn')
+        self.taskq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
+        self.resultq = ctxt.Queue(maxsize=1 + config.OCRD_MAX_PARALLEL_PAGES)
+        self.terminate = ctxt.Event()
+        ctxt = mp.get_context('fork') # base.Processor will fork workers
+        self.results = ctxt.Manager().dict()
+        super().__init__()
+        self.daemon = True
+    def __call__(self, page_id, *page_input):
+        self.taskq.put((page_id, page_input))
+        self.logger.debug("sent task for '%s'", page_id)
+        #return self.get(page_id)
+        result = self.get(page_id)
+        self.logger.debug("received result for '%s'", page_id)
+        return result
+    def get(self, page_id):
+        while not self.terminate.is_set():
+            if page_id in self.results:
+                result = self.results.pop(page_id)
+                if isinstance(result, Exception):
+                    raise Exception(f"predictor failed for {page_id}") from result
+                return result
+            try:
+                # a dequeued result may belong to another page's consumer,
+                # so do not shadow our own page_id here
+                other_id, result = self.resultq.get(timeout=0.7)
+            except mp.queues.Empty:
+                continue
+            self.logger.debug("storing results for '%s'", other_id)
+            self.results[other_id] = result
+        raise Exception(f"predictor terminated while waiting on results for {page_id}")
+    def run(self):
+        initLogging()
+        try:
+            self.setup()
+        except Exception as e:
+            self.logger.exception("setup failed")
+            self.terminate.set()
+        while not self.terminate.is_set():
+            try:
+                page_id, page_input = self.taskq.get(timeout=1.1)
+            except mp.queues.Empty:
+                continue
+            self.logger.debug("predicting '%s'", page_id)
+            try:
+                page_output = self.predict(*page_input)
+            except Exception as e:
+                self.logger.error("prediction failed: %s", e.__class__.__name__)
+                page_output = e
+            self.resultq.put((page_id, page_output))
+            self.logger.debug("sent result for '%s'", page_id)
+        self.resultq.close()
+        self.resultq.cancel_join_thread()
+        
self.logger.debug("predictor terminated") + def setup(self): + raise NotImplementedError() + def predict(self, *inputs): + raise NotImplementedError() + def shutdown(self): + # do not terminate from forked processor instances + if mp.parent_process() is None: + self.terminate.set() + self.taskq.close() + self.taskq.cancel_join_thread() + self.logger.debug(f"terminated {self} in {mp.current_process().name}") + else: + self.logger.debug(f"not touching {self} in {mp.current_process().name}") + def __del__(self): + self.logger.debug(f"deinit of {self} in {mp.current_process().name}") + self.shutdown() diff --git a/ocrd_kraken/config.py b/ocrd_kraken/config.py deleted file mode 100644 index 1816957..0000000 --- a/ocrd_kraken/config.py +++ /dev/null @@ -1,5 +0,0 @@ -import json -from ocrd_utils import resource_filename - -with open(resource_filename('ocrd_kraken', 'ocrd-tool.json'), 'r', encoding='utf-8') as f: - OCRD_TOOL = json.load(f) diff --git a/ocrd_kraken/ocrd-tool.json b/ocrd_kraken/ocrd-tool.json index beac9ab..576aaf7 100644 --- a/ocrd_kraken/ocrd-tool.json +++ b/ocrd_kraken/ocrd-tool.json @@ -4,8 +4,8 @@ "tools": { "ocrd-kraken-binarize": { "executable": "ocrd-kraken-binarize", - "input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-SEG-REGION", "OCR-D-SEG-LINE"], - "output_file_grp": ["OCR-D-PRE-BIN"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": [ "Image preprocessing" ], @@ -24,8 +24,8 @@ }, "ocrd-kraken-segment": { "executable": "ocrd-kraken-segment", - "input_file_grp": ["OCR-D-IMG", "OCR-D-PRE-CROP", "OCR-D-PRE-BIN"], - "output_file_grp": ["OCR-D-SEG-REGION", "OCR-D-SEG-LINE"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": [ "Layout analysis" ], @@ -128,8 +128,8 @@ }, "ocrd-kraken-recognize": { "executable": "ocrd-kraken-recognize", - "input_file_grp": ["OCR-D-SEG-LINE"], - "output_file_grp": ["OCR-D-OCR-KRAK"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "categories": ["Text recognition and optimization"], "steps": ["recognition/text-recognition"], "description": "Text recognition with Kraken", diff --git a/ocrd_kraken/recognize.py b/ocrd_kraken/recognize.py index 2e2ed1d..70c9ae2 100644 --- a/ocrd_kraken/recognize.py +++ b/ocrd_kraken/recognize.py @@ -1,6 +1,8 @@ -from os.path import join +from typing import Optional, Union +from ocrd.processor.base import OcrdPageResult import regex import itertools +from collections import defaultdict import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from shapely.geometry import Polygon, LineString, box as Rectangle @@ -8,9 +10,6 @@ from ocrd import Processor from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, bbox_from_polygon, @@ -18,12 +17,10 @@ points_from_bbox, polygon_from_points, xywh_from_points, - bbox_from_points, transform_coordinates, - MIMETYPE_PAGE, ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + OcrdPage, RegionRefType, RegionRefIndexedType, OrderedGroupType, @@ -35,52 +32,72 @@ WordType, GlyphType, CoordsType, - to_xml ) from ocrd_models.ocrd_page_generateds import ( ReadingDirectionSimpleType, TextLineOrderSimpleType ) -from ocrd_kraken.config import OCRD_TOOL - -class KrakenRecognize(Processor): - - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-recognize'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, 
**kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() +from .common import KrakenPredictor +class KrakenRecognizePredictor(KrakenPredictor): + # workaround for Kraken's unpicklable defaultdict choice + class DefaultDict(defaultdict): + def __init__(self, default=None): + self.default = default + super().__init__() + def default_factory(self): + return self.default def setup(self): - """ - Load models - """ - log = getLogger('processor.KrakenRecognize') import torch - from kraken.rpred import rpred from kraken.lib.models import load_any - model_fname = self.resolve_resource(self.parameter['model']) - log.info("loading model '%s'", model_fname) + model = self.parameter['model'] + self.logger.info("loading model '%s'", model) device = self.parameter['device'] if device != 'cpu' and not torch.cuda.is_available(): device = 'cpu' if device == 'cpu': - log.warning("no CUDA device available. Running without GPU will be slow") - self.model = load_any(model_fname, device=device) - def predict(page_image, segmentation): - return rpred(self.model, page_image, segmentation, - self.parameter['pad'], - self.parameter['bidi_reordering']) - self.predict = predict - - def process(self): + self.logger.warning("no CUDA device available. Running without GPU will be slow") + self.model = load_any(model, device=device) + def predict(self, *inputs): + from kraken.rpred import mm_rpred + if not len(inputs): + return self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1' + image, segmentation = inputs + nets = __class__.DefaultDict(self.model) + result = mm_rpred(nets, image, segmentation, + self.parameter['pad'], + self.parameter['bidi_reordering']) + # we must exhaust the generator before enqueuing + return list(result) + +class KrakenRecognize(Processor): + + @property + def executable(self): + return 'ocrd-kraken-recognize' + + def setup(self): + """ + Load model, set predict function + """ + parameter = dict(self.parameter) + parameter['model'] = self.resolve_resource(parameter['model']) + self.predictor = KrakenRecognizePredictor(self.logger, parameter) + self.predictor.start() + self.binary = self.predictor("") # blocks until model is loaded + self.logger.info("loaded %s model %s", "binary" if self.binary else "grayscale", self.parameter["model"]) + + def shutdown(self): + if getattr(self, 'predictor', None): + self.predictor.shutdown() + del self.predictor + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Recognize text on lines with Kraken. - Open and deserialise each PAGE input file and its respective image, - then iterate over the element hierarchy down to the line level. + Open the parsed PAGE-XML file, then iterate over the element hierarchy + down to the line level. Set up Kraken to recognise each text line (via coordinates into the higher-level image, or from the alternative image. If the model @@ -94,149 +111,136 @@ def process(self): into additional TextEquiv at each level, and make the higher levels consistent with that (by concatenation joined by whitespace). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting hierarchy. 
""" + assert self.workspace from kraken.containers import Segmentation, BaselineLine, BBoxLine - log = getLogger('processor.KrakenRecognize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, - feature_selector="binarized" - if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1' - else '') - page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1) - # todo: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate - - all_lines = page.get_AllTextLines() - # assumes that missing baselines are rare, if any - if any(line.Baseline for line in all_lines): - log.info("Converting PAGE to Kraken Segmentation (baselines)") - segtype = 'baselines' - else: - log.info("Converting PAGE to Kraken Segmentation (boxes only)") - segtype = 'bbox' - scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines]) - log.info("Estimated scale: %.1f", scale) - seglines = [] - for line in all_lines: - # FIXME: see whether model prefers baselines or bbox crops (seg_type) - # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization - poly = coordinates_of_segment(line, None, page_coords) - poly = make_valid(Polygon(poly)) - poly = poly.intersection(page_rect) - if segtype == 'baselines': - if line.Baseline is None: + + pcgts = input_pcgts[0] + assert pcgts + page = pcgts.get_Page() + assert page + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + feature_selector="binarized" + if self.binary else '') + page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1) + # TODO: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate + + all_lines = page.get_AllTextLines() + # assumes that missing baselines are rare, if any + if any(line.Baseline for line in all_lines): + self.logger.info("Converting PAGE to Kraken Segmentation (baselines)") + segtype = 'baselines' + else: + self.logger.info("Converting PAGE to Kraken Segmentation (boxes only)") + segtype = 'bbox' + scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines]) + self.logger.info("Estimated scale: %.1f", scale) + seglines = [] + for line in all_lines: + # FIXME: see whether model prefers baselines or bbox crops (seg_type) + # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization + poly = coordinates_of_segment(line, None, page_coords) + poly = make_valid(Polygon(poly)) + poly = poly.intersection(page_rect) + if segtype == 'baselines': + if line.Baseline is None: + base = dummy_baseline_of_segment(line, page_coords) + else: + base = baseline_of_segment(line, page_coords) + if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1: + base = dummy_baseline_of_segment(line, page_coords) + elif not LineString(base).intersects(poly): base = dummy_baseline_of_segment(line, page_coords) - else: - base = baseline_of_segment(line, page_coords) - if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1: - base = 
dummy_baseline_of_segment(line, page_coords) - elif not LineString(base).intersects(poly): - base = dummy_baseline_of_segment(line, page_coords) - # kraken expects baseline to be fully contained in boundary - base = LineString(base) - if poly.is_empty: - poly = polygon_from_baseline(base, scale=scale) - elif not base.within(poly): - poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)], - loc=line.id, scale=scale) - seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)), - boundary=list(map(tuple, poly.exterior.coords)), - id=line.id, - tags={'type': 'default'})) - # write back - base = coordinates_for_segment(base.coords, None, page_coords) - line.set_Baseline(BaselineType(points=points_from_polygon(base))) - poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords) - line.set_Coords(CoordsType(points=points_from_polygon(poly))) + # kraken expects baseline to be fully contained in boundary + base = LineString(base) + if poly.is_empty: + poly = polygon_from_baseline(base, scale=scale) + elif not base.within(poly): + poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)], + loc=line.id, scale=scale) + seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)), + boundary=list(map(tuple, poly.exterior.coords)), + id=line.id, + tags={'type': 'default'})) + # write back + base = coordinates_for_segment(base.coords, None, page_coords) + line.set_Baseline(BaselineType(points=points_from_polygon(base))) + poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords) + line.set_Coords(CoordsType(points=points_from_polygon(poly))) + else: + seglines.append(BBoxLine(bbox=poly.envelope.bounds, + id=line.id)) + + segmentation = Segmentation(lines=seglines, + script_detection=False, + text_direction='horizontal-lr', + type=segtype, + imagename=page_id) + for idx_line, ocr_record in enumerate(self.predictor(page_id, page_image, segmentation)): + line = all_lines[idx_line] + id_line = line.id + if not ocr_record.prediction and not ocr_record.cuts: + self.logger.warning('No results for line "%s"', line.id) + continue + text_line = ocr_record.prediction + if len(ocr_record.confidences) > 0: + conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences) + else: + conf_line = None + if self.parameter['overwrite_text']: + line.TextEquiv = [] + line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line)) + idx_word = 0 + line_offset = 0 + for text_word in regex.splititer(r'(\s+)', text_line): + next_offset = line_offset + len(text_word) + cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset])) + # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops + # as a workaround, here we just steal from the next glyph start, respectively: + if len(ocr_record.cuts) > next_offset + 1: + cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1]))) else: - seglines.append(BBoxLine(bbox=poly.envelope.bounds, - id=line.id)) - - segmentation = Segmentation(lines=seglines, - script_detection=False, - text_direction='horizontal-lr', - type=segtype, - imagename=page_id) - for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)): - line = all_lines[idx_line] - id_line = line.id - if not ocr_record.prediction and not ocr_record.cuts: - log.warning('No results for line "%s"', line.id) + cuts_word.append(list(ocr_record.cuts[-1])) + confidences_word = ocr_record.confidences[line_offset:next_offset] + line_offset = next_offset + if 
len(text_word.strip()) == 0: continue - text_line = ocr_record.prediction - if len(ocr_record.confidences) > 0: - conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences) + id_word = '%s_word_%s' % (id_line, idx_word + 1) + idx_word += 1 + poly_word = [point for cut in cuts_word for point in cut] + bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords)) + # avoid zero-size coords on ties + bbox_word = np.array(bbox_word, dtype=int) + if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0: + bbox_word[2:4] += 1 + if len(confidences_word) > 0: + conf_word = sum(confidences_word) / len(confidences_word) else: - conf_line = None - if self.parameter['overwrite_text']: - line.TextEquiv = [] - line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line)) - idx_word = 0 - line_offset = 0 - for text_word in regex.splititer(r'(\s+)', text_line): - next_offset = line_offset + len(text_word) - cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset])) - # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops - # as a workaround, here we just steal from the next glyph start, respectively: - if len(ocr_record.cuts) > next_offset + 1: - cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1]))) - else: - cuts_word.append(list(ocr_record.cuts[-1])) - confidences_word = ocr_record.confidences[line_offset:next_offset] - line_offset = next_offset - if len(text_word.strip()) == 0: - continue - id_word = '%s_word_%s' % (id_line, idx_word + 1) - idx_word += 1 - poly_word = [point for cut in cuts_word for point in cut] - bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords)) + conf_word = None + word = WordType(id=id_word, + Coords=CoordsType(points=points_from_bbox(*bbox_word))) + word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word)) + for idx_glyph, text_glyph in enumerate(text_word): + id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1) + poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1] + bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords)) # avoid zero-size coords on ties - bbox_word = np.array(bbox_word, dtype=int) - if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0: - bbox_word[2:4] += 1 - if len(confidences_word) > 0: - conf_word = sum(confidences_word) / len(confidences_word) - else: - conf_word = None - word = WordType(id=id_word, - Coords=CoordsType(points=points_from_bbox(*bbox_word))) - word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word)) - for idx_glyph, text_glyph in enumerate(text_word): - id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1) - poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1] - bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords)) - # avoid zero-size coords on ties - bbox_glyph = np.array(bbox_glyph, dtype=int) - if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0: - bbox_glyph[2:4] += 1 - conf_glyph = confidences_word[idx_glyph] - glyph = GlyphType(id=id_glyph, - Coords=CoordsType(points=points_from_bbox(*bbox_glyph))) - glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph)) - word.add_Glyph(glyph) - line.add_Word(word) - log.info('Recognized line "%s"', line.id) + bbox_glyph = np.array(bbox_glyph, dtype=int) + if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0: + bbox_glyph[2:4] += 1 + conf_glyph = confidences_word[idx_glyph] + glyph = GlyphType(id=id_glyph, + 
Coords=CoordsType(points=points_from_bbox(*bbox_glyph))) + glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph)) + word.add_Glyph(glyph) + line.add_Word(word) + self.logger.info('Recognized line "%s"', line.id) page_update_higher_textequiv_levels('line', pcgts) - log.info("Finished recognition, serializing") - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, f'{file_id}.xml'), - content=to_xml(pcgts)) + self.logger.info("Finished recognition, serializing") + return OcrdPageResult(pcgts) # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): @@ -251,7 +255,7 @@ def dummy_baseline_of_segment(segment, coords, yrel=0.2): return [[xmin, ymid], [xmax, ymid]] # zzz should go into core ocrd_utils -def polygon_from_baseline(baseline, scale=20): +def polygon_from_baseline(baseline, scale : Union[float, np.floating] = 20): if not isinstance(baseline, LineString): baseline = LineString(baseline) ltr = baseline.coords[0][0] < baseline.coords[-1][0] @@ -261,7 +265,7 @@ def polygon_from_baseline(baseline, scale=20): scale=scale)) return polygon -def join_polygons(polygons, loc='', scale=20): +def join_polygons(polygons, loc='', scale : Union[float, np.floating] = 20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull diff --git a/ocrd_kraken/segment.py b/ocrd_kraken/segment.py index 14e19dc..9d4eab3 100644 --- a/ocrd_kraken/segment.py +++ b/ocrd_kraken/segment.py @@ -1,83 +1,97 @@ +from typing import Optional from PIL import ImageOps -from os.path import join + +import shapely.geometry as geom +from shapely.prepared import prep as geom_prep +import torch from ocrd import Processor +from ocrd.processor.ocrd_page_result import OcrdPageResult from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, - make_file_id, - concat_padded, polygon_from_x0y0x1y1, points_from_polygon, polygon_mask, coordinates_for_segment, coordinates_of_segment, - MIMETYPE_PAGE ) import ocrd_models.ocrd_page from ocrd_models.ocrd_page import ( + OcrdPage, PageType, BorderType, TextRegionType, TextLineType, CoordsType, BaselineType, - to_xml ) -from ocrd_modelfactory import page_from_file - -import shapely.geometry as geom -from shapely.prepared import prep as geom_prep -import torch - -from .config import OCRD_TOOL -class KrakenSegment(Processor): - - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() +from .common import KrakenPredictor +class KrakenSegmentPredictor(KrakenPredictor): def setup(self): - """ - Load models - """ - self.log = getLogger('processor.KrakenSegment') - kwargs = {} - kwargs['text_direction'] = self.parameter['text_direction'] - self.use_legacy = self.parameter['use_legacy'] + self.use_legacy = self.parameter.pop('use_legacy') if self.use_legacy: - from kraken.pageseg import segment - kwargs['scale'] = self.parameter['scale'] - kwargs['maxcolseps'] = self.parameter['maxcolseps'] - kwargs['black_colseps'] = self.parameter['black_colseps'] - self.log.info("Using legacy segmenter") + self.logger.info("Using legacy segmenter") + # adapt to Kraken v5 changes: + 
self.parameter['no_hlines'] = self.parameter.pop('remove_hlines') + self.parameter.pop('device') else: from kraken.lib.vgsl import TorchVGSLModel - from kraken.blla import segment - self.log.info("Using blla segmenter") - blla_model_fname = self.resolve_resource(self.parameter['blla_model']) - kwargs['model'] = TorchVGSLModel.load_model(blla_model_fname) + self.logger.info("Using blla segmenter") + self.logger.info("loading model '%s'", self.parameter['model']) + self.parameter['model'] = TorchVGSLModel.load_model(self.parameter['model']) device = self.parameter['device'] if device != 'cpu' and not torch.cuda.is_available(): device = 'cpu' if device == 'cpu': - self.log.warning("no CUDA device available. Running without GPU will be slow") - kwargs['device'] = device - def segmenter(img, mask=None): - return segment(img, mask=mask, **kwargs) - self.segmenter = segmenter + self.logger.warning("no CUDA device available. Running without GPU will be slow") + self.parameter['device'] = device + # adapt to Kraken v5 changes: + self.parameter.pop('scale') + self.parameter.pop('remove_hlines') + self.parameter.pop('maxcolseps') + self.parameter.pop('black_colseps') + def predict(self, *inputs): + if self.use_legacy: + from kraken.pageseg import segment + else: + from kraken.blla import segment + image, mask = inputs + return segment(image, mask=mask, **self.parameter) + +class KrakenSegment(Processor): + + @property + def executable(self): + return 'ocrd-kraken-segment' + + def setup(self): + """ + Load models + """ + parameter = dict(self.parameter) + model = parameter.pop('blla_model') + del parameter['blla_classes'] + del parameter['overwrite_segments'] + del parameter['level-of-operation'] + self.use_legacy = parameter['use_legacy'] + if not self.use_legacy: + parameter['model'] = self.resolve_resource(model) + self.predictor = KrakenSegmentPredictor(self.logger, parameter) + self.predictor.start() + + def shutdown(self): + import multiprocessing as mp + if getattr(self, 'predictor', None): + self.predictor.shutdown() + del self.predictor - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment into (regions and) lines with Kraken. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the ``level-of-operation``, - i.e.: + Iterate over the element hierarchy of the PAGE-XML down to the + ``level-of-operation``, i.e.: \b - On `page` level and `table` level, detect text regions and lines @@ -96,70 +110,58 @@ def process(self): Then compute a segmentation and decode it into new (text regions and) lines, and append them to the parent segment. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting hierarchy. 
""" - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - self.log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_coords, page_info = self.workspace.image_from_page( - page, page_id, - feature_selector="binarized" if self.use_legacy else "") - if page_info.resolution != 1: - dpi = page_info.resolution - if page_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - zoom = 300.0 / dpi - else: - zoom = 1.0 - # TODO: be DPI-relative + pcgts = input_pcgts[0] + assert pcgts + page = pcgts.get_Page() + assert page + page_image, page_coords, page_info = self.workspace.image_from_page( + page, page_id, + feature_selector="binarized" if self.use_legacy else "") + if page_info.resolution != 1: + dpi = page_info.resolution + if page_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + zoom = 300.0 / dpi + else: + zoom = 1.0 + # TODO: be DPI-relative - if self.parameter['level-of-operation'] == 'page': - self.log.info('Segmenting page with %s segmenter', 'legacy' if self.use_legacy else 'blla') + if self.parameter['level-of-operation'] == 'page': + self.logger.info('Segmenting page with %s segmenter', 'legacy' if self.use_legacy else 'blla') + if self.parameter['overwrite_segments']: + page.TextRegion = [] + elif len(page.TextRegion or []): + self.logger.warning('Keeping %d text regions on page "%s"', len(page.TextRegion or []), page.id) + self._process_page(page_image, page_coords, page, page_id, zoom) + elif self.parameter['level-of-operation'] == 'table': + regions = page.get_AllRegions(classes=['Table']) + if not regions: + self.logger.warning('No existing table regions on page "%s"', page_id) + for region in regions: + self.logger.info('Segmenting table region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') if self.parameter['overwrite_segments']: - page.TextRegion = [] - elif len(page.TextRegion or []): - self.log.warning('Keeping %d text regions on page "%s"', len(page.TextRegion or []), page.id) - self._process_page(page_image, page_coords, page, zoom) - elif self.parameter['level-of-operation'] == 'table': - regions = page.get_AllRegions(classes=['Table']) - if not regions: - self.log.warning('No existing table regions on page "%s"', page_id) - for region in regions: - self.log.info('Segmenting table region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') - if self.parameter['overwrite_segments']: - region.TextRegion = [] - elif len(region.TextRegion or []): - self.log.warning('Keeping %d text regions in region "%s"', len(region.TextRegion or []), region.id) - self._process_page(page_image, page_coords, region, zoom) - else: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.log.warning('No existing text regions on page "%s"', page_id) - for region in regions: - self.log.info('Segmenting text region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') - if self.parameter['overwrite_segments']: - region.TextLine = [] - elif len(region.TextLine or []): - self.log.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) - self._process_region(page_image, page_coords, region, zoom) + region.TextRegion = [] + elif len(region.TextRegion or []): + 
self.logger.warning('Keeping %d text regions in region "%s"', len(region.TextRegion or []), region.id) + self._process_page(page_image, page_coords, region, page_id, zoom) + else: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('No existing text regions on page "%s"', page_id) + for region in regions: + self.logger.info('Segmenting text region "%s" with %s segmenter', region.id, 'legacy' if self.use_legacy else 'blla') + if self.parameter['overwrite_segments']: + region.TextLine = [] + elif len(region.TextLine or []): + self.logger.warning('Keeping %d lines in region "%s"', len(region.TextLine or []), region.id) + self._process_region(page_image, page_coords, region, page_id, zoom) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, f'{file_id}.xml'), - content=to_xml(pcgts)) + return OcrdPageResult(pcgts) - def _process_page(self, page_image, page_coords, page, zoom=1.0): + def _process_page(self, page_image, page_coords, page, page_id, zoom=1.0): def getmask(): # use mask if existing regions (any type for page, text cells for table) # or segment is lower than page level @@ -192,15 +194,15 @@ def getmask(): # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask = ImageOps.invert(polygon_mask(page_image, poly)) for region in regions: - self.log.info("Masking existing region %s", region.id) + self.logger.info("Masking existing region %s", region.id) poly = coordinates_of_segment(region, page_image, page_coords) # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask.paste(255, mask=polygon_mask(page_image, poly)) return mask - res = self.segmenter(page_image, mask=getmask()) - self.log.debug("Finished segmentation, serializing") + res = self.predictor(page_id, page_image, getmask()) + self.logger.debug("Finished segmentation, serializing") + #self.logger.debug(res) if self.use_legacy: - self.log.debug(res) idx_line = 0 for idx_line, line in enumerate(res.lines): line_poly = polygon_from_x0y0x1y1(line.bbox) @@ -213,9 +215,8 @@ def getmask(): id=f'region_line_{idx_line + 1}_line', Coords=CoordsType(points=line_points))) page.add_TextRegion(region_elem) - self.log.debug("Found %d lines on page %s", idx_line + 1, page.id) + self.logger.debug("Found %d lines on page %s", idx_line + 1, page.id) else: - self.log.debug(res) handled_lines = {} regions = [(type_, region) for type_ in res.regions @@ -239,11 +240,11 @@ def getmask(): line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'region_{idx_region + 1}_line_{idx_line + 1}' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = make_valid(geom.Polygon(line_poly)) if region_poly.contains(line_poly): if idx_line in handled_lines: - self.log.error("Line %s was already added to region %s" % (idx_line, handled_lines[idx_line])) + self.logger.error("Line %s was already added to region %s" % (idx_line, handled_lines[idx_line])) continue region_elem.add_TextLine(TextLineType( id=line_id, @@ -252,12 +253,12 @@ def getmask(): handled_lines[idx_line] = idx_region for idx_line, line in enumerate(res.lines): if idx_line not in handled_lines: - self.log.error("Line %s could not be assigned a region, creating a dummy region", idx_line) + 
self.logger.error("Line %s could not be assigned a region, creating a dummy region", idx_line) line_poly = coordinates_for_segment(line.boundary, None, page_coords) line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'region_line_{idx_line + 1}_line' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = make_valid(geom.Polygon(line_poly)).exterior.coords[:-1] region_elem = TextRegionType( id='region_line_%s' % (idx_line + 1), @@ -267,21 +268,21 @@ def getmask(): Baseline=BaselineType(points=points_from_polygon(line_baseline)), Coords=CoordsType(points=points_from_polygon(line_poly)))) page.add_TextRegion(region_elem) - self.log.debug("Found %d lines and %d regions on page %s", idx_line + 1, idx_region + 1, page.id) + self.logger.debug("Found %d lines and %d regions on page %s", idx_line + 1, idx_region + 1, page.id) - def _process_region(self, page_image, page_coords, region, zoom=1.0): + def _process_region(self, page_image, page_coords, region, page_id, zoom=1.0): def getmask(): poly = coordinates_of_segment(region, page_image, page_coords) poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask = ImageOps.invert(polygon_mask(page_image, poly)) for line in region.TextLine: - self.log.info("Masking existing line %s", line.id) + self.logger.info("Masking existing line %s", line.id) poly = coordinates_of_segment(line, page_image, page_coords) # poly = geom.Polygon(poly).buffer(20/zoom).exterior.coords[:-1] mask.paste(255, mask=polygon_mask(page_image, poly)) return mask - res = self.segmenter(page_image, mask=getmask()) - self.log.debug("Finished segmentation, serializing") + res = self.predictor(page_id, page_image, getmask()) + self.logger.debug("Finished segmentation, serializing") idx_line = 0 if self.use_legacy: for idx_line, line in enumerate(res.lines): @@ -297,7 +298,7 @@ def getmask(): line_baseline = coordinates_for_segment(line.baseline, None, page_coords) line_id = f'{region.id}_line_{idx_line + 1}' line_type = line.tags.get('type', '') - self.log.info("Line %s is of type %s", line_id, line_type) + self.logger.info("Line %s is of type %s", line_id, line_type) line_poly = geom.Polygon(line_poly) #line_poly = line_poly.intersection(region_poly) line_poly = make_valid(line_poly).exterior.coords[:-1] @@ -305,7 +306,7 @@ def getmask(): id=line_id, Baseline=BaselineType(points=points_from_polygon(line_baseline)), Coords=CoordsType(points=points_from_polygon(line_poly)))) - self.log.debug("Found %d lines in region %s", idx_line + 1, region.id) + self.logger.debug("Found %d lines in region %s", idx_line + 1, region.id) def make_valid(polygon): for split in range(1, len(polygon.exterior.coords)-1): diff --git a/requirements.txt b/requirements.txt index 6bbfb40..d6faf5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -ocrd >= 2.65 +ocrd >= 3.0.2 kraken >= 5.0 scipy shapely +regex diff --git a/tests/base.py b/tests/base.py deleted file mode 100644 index 1387769..0000000 --- a/tests/base.py +++ /dev/null @@ -1,89 +0,0 @@ -# pylint: disable=unused-import - -from os.path import dirname, realpath -from os import chdir -import sys -import logging -import io -import collections -from unittest import TestCase as VanillaTestCase, skip, main as unittests_main -import pytest -from ocrd_utils import disableLogging, initLogging - -from tests.assets import assets, copy_of_directory - - -def main(fn=None): - if fn: - 
sys.exit(pytest.main([fn])) - else: - unittests_main() - - -class TestCase(VanillaTestCase): - - @classmethod - def setUpClass(cls): - chdir(dirname(realpath(__file__)) + '/..') - - def setUp(self): - disableLogging() - initLogging() - -class CapturingTestCase(TestCase): - """ - A TestCase that needs to capture stderr/stdout and invoke click CLI. - """ - - @pytest.fixture(autouse=True) - def _setup_pytest_capfd(self, capfd): - self.capfd = capfd - - def invoke_cli(self, cli, args): - """ - Substitution for click.CliRunner.invooke that works together nicely - with unittests/pytest capturing stdout/stderr. - """ - self.capture_out_err() # XXX snapshot just before executing the CLI - code = 0 - sys.argv[1:] = args # XXX necessary because sys.argv reflects pytest args not cli args - try: - cli.main(args=args) - except SystemExit as e: - code = e.code - out, err = self.capture_out_err() - return code, out, err - - def capture_out_err(self): - return self.capfd.readouterr() - -# import traceback -# import warnings -# def warn_with_traceback(message, category, filename, lineno, file=None, line=None): -# log = file if hasattr(file, 'write') else sys.stderr -# traceback.print_stack(file=log) -# log.write(warnings.formatwarning(message, category, filename, lineno, line)) -# warnings.showwarning = warn_with_traceback - -# https://stackoverflow.com/questions/37944111/python-rolling-log-to-a-variable -# Adapted from http://alanwsmith.com/capturing-python-log-output-in-a-variable - -class FIFOIO(io.TextIOBase): - def __init__(self, size, *args): - self.maxsize = size - io.TextIOBase.__init__(self, *args) - self.deque = collections.deque() - def getvalue(self): - return ''.join(self.deque) - def write(self, x): - self.deque.append(x) - self.shrink() - def shrink(self): - if self.maxsize is None: - return - size = sum(len(x) for x in self.deque) - while size > self.maxsize: - x = self.deque.popleft() - size -= len(x) - -sys.path.append(dirname(realpath(__file__)) + '/../ocrd') diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0d7eee5 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,63 @@ +# pylint: disable=unused-import + +from multiprocessing import Process +from time import sleep +import pytest + +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import pushd_popd, disableLogging, initLogging, setOverrideLogLevel, config + +from .assets import assets + +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def workspace(tmpdir, pytestconfig, request): + def _make_workspace(workspace_path): + initLogging() + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + directory = str(tmpdir) + resolver = Resolver() + workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True) + config.OCRD_MISSING_OUTPUT = "ABORT" + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + workspace = Workspace(resolver, directory, mets_server_url='mets.sock') + yield {'workspace': workspace, 'mets_server_url': 
'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace} + config.reset_defaults() + disableLogging() + return _make_workspace + + +@pytest.fixture +def workspace_manifesto(workspace): + yield from workspace(assets.path_to('communist_manifesto/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784/data/mets.xml')) + +@pytest.fixture +def workspace_aufklaerung_region(workspace): + yield from workspace(assets.path_to('kant_aufklaerung_1784-page-region/data/mets.xml')) + +@pytest.fixture +def workspace_sbb(workspace): + yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) diff --git a/tests/test_binarize.py b/tests/test_binarize.py index 8f5c4b8..da9adea 100644 --- a/tests/test_binarize.py +++ b/tests/test_binarize.py @@ -1,60 +1,61 @@ # pylint: disable=import-error +import json import os -import shutil -import pytest -from tests.base import assets, main +from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file -from ocrd import Resolver from ocrd_kraken.binarize import KrakenBinarize -from ocrd_utils.logging import setOverrideLogLevel -setOverrideLogLevel('DEBUG') +from .assets import assets -PARAM_JSON = assets.url_of('param-binarize.json') +PARAM_JSON = assets.url_of('param-binarize.json') -@pytest.fixture() -def workspace(tmpdir): - if os.path.exists(tmpdir): - shutil.rmtree(tmpdir) - workspace = Resolver().workspace_from_url( - assets.path_to('kant_aufklaerung_1784/data/mets.xml'), - dst_dir=tmpdir, - download=True +def analyse_result(ws, level): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-BIN-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_images = list(ws.find_files(fileGrp="OCR-D-BIN-KRAKEN", mimetype="//^image/.*")) + assert len(out_images), "found no output image file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_images = out_pcgts.etree.xpath('//page:%s/page:AlternativeImage[contains(@comments,"binarized")]' % level, namespaces=NAMESPACES) + assert len(out_images) > 0, "found no binarized AlternativeImages in output PAGE file" + +def test_param_json(workspace_sbb): + run_processor(KrakenBinarize, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-BIN-KRAKEN", + parameter=json.load(open(PARAM_JSON)), + **workspace_sbb, ) - return workspace - - -# def test_param_json(self): -# workspace = resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'), dst_dir=WORKSPACE_DIR) -# run_processor( -# KrakenBinarize, -# resolver=resolver, -# workspace=workspace, -# parameter=PARAM_JSON -# ) - -def test_binarize_regions(workspace): - proc = KrakenBinarize( - workspace, - input_file_grp="OCR-D-GT-PAGE", - output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'region'} + ws = workspace_sbb['workspace'] + ws.save_mets() + analyse_result(ws, 'Page') + +def test_binarize_regions(workspace_aufklaerung): + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-BIN-KRAKEN", + parameter={'level-of-operation': 'region'}, + **workspace_aufklaerung, ) - proc.process() - workspace.save_mets() - -def test_binarize_lines(workspace): - proc = KrakenBinarize( - workspace, - input_file_grp="OCR-D-GT-PAGE", - 
output_file_grp="OCR-D-IMG-BIN-KRAKEN", - parameter={'level-of-operation': 'line'} + ws = workspace_aufklaerung['workspace'] + ws.save_mets() + analyse_result(ws, 'TextRegion') + +def test_binarize_lines(workspace_aufklaerung): + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-BIN-KRAKEN", + parameter={'level-of-operation': 'line'}, + **workspace_aufklaerung, ) - proc.process() - workspace.save_mets() + ws = workspace_aufklaerung['workspace'] + ws.save_mets() + analyse_result(ws, 'TextLine') -if __name__ == "__main__": - main(__file__) diff --git a/tests/test_recognize.py b/tests/test_recognize.py index 0ae2850..8354a0e 100644 --- a/tests/test_recognize.py +++ b/tests/test_recognize.py @@ -1,32 +1,36 @@ # pylint: disable=import-error import os -import shutil -from tests.base import TestCase, assets, main +from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file -from ocrd import Resolver, run_processor -from ocrd_utils import initLogging, pushd_popd from ocrd_kraken.recognize import KrakenRecognize +from ocrd_kraken.binarize import KrakenBinarize -class TestKrakenRecognize(TestCase): - def setUp(self): - initLogging() - - def test_recognize(self): - resolver = Resolver() - # with pushd_popd('/tmp/kraken-test') as tempdir: - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - workspace.overwrite_mode = True - proc = KrakenRecognize( - workspace, - input_file_grp="OCR-D-SEG-KRAKEN", - output_file_grp="OCR-D-OCR-KRAKEN", - ) - proc.process() - workspace.save_mets() - -if __name__ == "__main__": - main(__file__) +def test_recognize(workspace_aufklaerung): + # some models (like default en) require binarized images + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-GT-PAGE-BIN", + **workspace_aufklaerung, + ) + run_processor(KrakenRecognize, + # re-use layout, overwrite text: + input_file_grp="OCR-D-GT-PAGE-BIN", + output_file_grp="OCR-D-OCR-KRAKEN", + parameter={'overwrite_text': True}, + **workspace_aufklaerung, + ) + ws = workspace_aufklaerung['workspace'] + ws.save_mets() + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-KRAKEN')) + results = ws.find_files(file_grp='OCR-D-OCR-KRAKEN', mimetype=MIMETYPE_PAGE) + result0 = next(results, False) + assert result0, "found no output PAGE file" + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) + assert len(text0) > 0, "found no glyph text in output PAGE file" diff --git a/tests/test_segment.py b/tests/test_segment.py index 627fbbf..6c00880 100644 --- a/tests/test_segment.py +++ b/tests/test_segment.py @@ -1,59 +1,65 @@ # pylint: disable=import-error import os -import shutil -from tests.base import TestCase, assets, main +from ocrd import run_processor +from ocrd_utils import MIMETYPE_PAGE +from ocrd_models.constants import NAMESPACES +from ocrd_modelfactory import page_from_file -from ocrd import Resolver -from ocrd_utils import initLogging, pushd_popd from ocrd_kraken.segment import KrakenSegment +from ocrd_kraken.binarize import KrakenBinarize -class TestKrakenSegment(TestCase): - - def setUp(self): - initLogging() - - def test_run_blla(self): - resolver = Resolver() - with pushd_popd(tempdir=True) as tempdir: - workspace = 
resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, - input_file_grp="OCR-D-IMG-BIN", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': False} - ) - proc.process() - workspace.save_mets() - - def test_run_blla_regionlevel(self): - resolver = Resolver() - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('kant_aufklaerung_1784-page-region/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, - input_file_grp="OCR-D-GT-SEG-REGION", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - page_id="phys_0005", - parameter={'maxcolseps': 0, 'use_legacy': False} - ) - proc.process() - workspace.save_mets() - - def test_run_legacy(self): - resolver = Resolver() - # with pushd_popd('/tmp/kraken-test') as tempdir: - with pushd_popd(tempdir=True) as tempdir: - workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir, download=True) - proc = KrakenSegment( - workspace, - input_file_grp="OCR-D-IMG-BIN", - output_file_grp="OCR-D-SEG-LINE-KRAKEN", - parameter={'maxcolseps': 0, 'use_legacy': True} - ) - proc.process() - workspace.save_mets() - -if __name__ == "__main__": - main(__file__) + +def analyse_result(ws): + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-SEG-LINE-KRAKEN')) + out_files = list(ws.find_files(fileGrp="OCR-D-SEG-LINE-KRAKEN", mimetype=MIMETYPE_PAGE)) + assert len(out_files), "found no output PAGE file" + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_regions = out_pcgts.etree.xpath('//page:TextRegion/page:Coords', namespaces=NAMESPACES) + assert len(out_regions) > 0, "found no text regions in output PAGE file" + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines), "found no text lines in output PAGE file" + +def test_run_blla(workspace_aufklaerung): + run_processor(KrakenSegment, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + parameter={'maxcolseps': 0, 'use_legacy': False}, + **workspace_aufklaerung, + ) + ws = workspace_aufklaerung['workspace'] + ws.save_mets() + analyse_result(ws) + +def test_run_blla_regionlevel(workspace_aufklaerung_region): + run_processor(KrakenSegment, + input_file_grp="OCR-D-GT-SEG-REGION", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + # only 1 page (takes 3min per page without GPU) + page_id="phys_0005", + parameter={'maxcolseps': 0, 'use_legacy': False}, + **workspace_aufklaerung_region, + ) + ws = workspace_aufklaerung_region['workspace'] + ws.save_mets() + analyse_result(ws) + +def test_run_legacy(workspace_aufklaerung): + # legacy segmentation requires binarized images + run_processor(KrakenBinarize, + input_file_grp="OCR-D-GT-PAGE", + output_file_grp="OCR-D-GT-PAGE-BIN", + **workspace_aufklaerung, + ) + run_processor(KrakenSegment, + # overwrite layout: + input_file_grp="OCR-D-GT-PAGE-BIN", + output_file_grp="OCR-D-SEG-LINE-KRAKEN", + parameter={'maxcolseps': 0, 'use_legacy': True, 'overwrite_segments': True}, + **workspace_aufklaerung, + ) + ws = workspace_aufklaerung['workspace'] + ws.save_mets() + analyse_result(ws)
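
Note on usage: the refactored tests above double as a reference for driving the ported v3-API processors programmatically via run_processor. A minimal end-to-end sketch follows; the METS path, output file group names, and parameter choices are illustrative assumptions, not part of this change set:

    from ocrd import Resolver, run_processor
    from ocrd_kraken.binarize import KrakenBinarize
    from ocrd_kraken.segment import KrakenSegment
    from ocrd_kraken.recognize import KrakenRecognize

    # hypothetical local workspace; any METS with an OCR-D-IMG group works
    workspace = Resolver().workspace_from_url('mets.xml', download=True)
    # some recognition models (like the default English one) expect binarized input
    run_processor(KrakenBinarize,
                  workspace=workspace,
                  input_file_grp='OCR-D-IMG',
                  output_file_grp='OCR-D-BIN',
                  parameter={'level-of-operation': 'page'})
    # segment into regions and lines with the trainable (blla) segmenter
    run_processor(KrakenSegment,
                  workspace=workspace,
                  input_file_grp='OCR-D-BIN',
                  output_file_grp='OCR-D-SEG',
                  parameter={'use_legacy': False})
    # recognize text on the segmented lines with the default model
    run_processor(KrakenRecognize,
                  workspace=workspace,
                  input_file_grp='OCR-D-SEG',
                  output_file_grp='OCR-D-OCR')
    workspace.save_mets()

Since setup() now starts a single spawned KrakenPredictor per processor instance, each run_processor call loads the model once and then serves all pages, including any page-parallel workers forked by core when OCRD_MAX_PARALLEL_PAGES is set.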