diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 21b0c69eb2..0b3ce5a56e 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -2,6 +2,10 @@ Processor, ResourceNotFoundError ) +from .ocrd_page_result import ( + OcrdPageResult, + OcrdPageResultImage +) from .helpers import ( run_cli, run_processor, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 43aec4ace0..46417ac2cc 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,13 +9,13 @@ 'run_processor' ] -from os.path import exists +from os.path import exists, join from shutil import copyfileobj import json import os from os import getcwd from pathlib import Path -from typing import Optional +from typing import List, Optional, Union import sys import inspect import tarfile @@ -23,6 +23,9 @@ from deprecated import deprecated from ocrd.workspace import Workspace +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -198,10 +201,12 @@ def verify(self): assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') - def assert_file_grp_cardinality(grps, spec, msg): - if isinstance(spec, int) and spec > 0: - assert len(grps) == spec, msg % (len(grps), str(spec)) + def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): + if isinstance(spec, int): + if spec > 0: + assert len(grps) == spec, msg % (len(grps), str(spec)) else: + assert isinstance(spec, list) minimum = spec[0] maximum = spec[1] if minimum > 0: @@ -289,7 +294,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise - input_files = [None] * len(input_file_tuple) + input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) for i, input_file in enumerate(input_file_tuple): if i == 0: log.info("processing page %s", input_file.pageId) @@ -309,7 +314,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files) -> None: + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -321,49 +326,55 @@ def process_page_file(self, *input_files) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) """ log = getLogger('ocrd.processor.base') - input_pcgts = [None] * len(input_files) + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): - # FIXME: what about non-PAGE input like image or JSON ??? + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: - input_pcgts[i] = page_from_file(input_file) + page_ = page_from_file(input_file) + assert isinstance(page_, PcGtsType) + input_pcgts[i] = page_ except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - if isinstance(output_pcgts, (list, tuple)): - output_images = output_pcgts[1:] - output_pcgts = output_pcgts[0] - for output_image_pil, output_image_id, output_image_path in output_images: - self.workspace.save_image_file( - output_image_pil, - output_image_id, - self.output_file_grp, - page_id=page_id, - file_path=output_image_path) - output_pcgts.set_pcGtsId(output_file_id) - self.add_metadata(output_pcgts) + result = self.process_page_pcgts(*input_pcgts, page_id=page_id) + for image_result in result.images: + image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' + image_file_path = join(self.output_file_grp, f'{image_file_id}.png') + image_result.alternative_image.set_filename(image_file_path) + self.workspace.save_image_file( + image_result.pil, + image_file_id, + self.output_file_grp, + page_id=page_id, + file_path=image_file_path) + result.pcgts.set_pcGtsId(output_file_id) + self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(output_pcgts)) + content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd_models.OcrdPage`. + resulting :py:class:`~ocrd.processor.OcrdPageResult`. - Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` - and one or more lists or tuples of :py:class:`PIL.Image` (image data), - :py:class:str (file ID) and :py:class:str (file path) of derived images - to be annotated along with the resulting PAGE file. + Optionally, add to the ``images`` attribute of the resulting + :py:class:`~ocrd.processor.OcrdPageResult` instances + of :py:class:`~ocrd.processor.OcrdPageResultImage`, + which have required fields for ``pil`` (:py:class:`PIL.Image` image data), + ``file_id_suffix`` (used for generating IDs of the saved image) and + ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` + for setting the filename of the saved image). (This contains the main functionality and must be overridden by subclasses.) """ @@ -374,7 +385,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None: Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. """ - pcgts.get_Metadata().add_MetadataItem( + metadata_obj = pcgts.get_Metadata() + assert metadata_obj is not None + metadata_obj.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=self.ocrd_tool['executable'], diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index b05ca9e6da..5ef76d2fa5 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,14 +1,17 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename +from typing import Optional, Union import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_page import to_xml +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, @@ -24,13 +27,16 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts[0] # nothing to do here - return input_pcgts[0] + return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files): + def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: LOG = getLogger('ocrd.dummy') input_file = input_files[0] + assert input_file + assert input_file.local_filename if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) @@ -48,7 +54,8 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - pcgts = self.process_page_pcgts(pcgts) + assert isinstance(pcgts, PcGtsType) + pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py new file mode 100644 index 0000000000..c63330c734 --- /dev/null +++ b/src/ocrd/processor/ocrd_page_result.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass, field +from typing import List +from ocrd_models.ocrd_page import OcrdPage +from PIL.Image import Image + +from ocrd_models.ocrd_page_generateds import AlternativeImageType + +@dataclass +class OcrdPageResultImage(): + pil : Image + file_id_suffix : str + alternative_image : AlternativeImageType + +@dataclass +class OcrdPageResult(): + pcgts : OcrdPage + images : List[OcrdPageResultImage] = field(default_factory=list) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index fc619b7d0b..eeaa6434fd 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image : Image, + def save_image_file(self, image : Image.Image, file_id : str, file_grp : str, file_path : Optional[str] = None, diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 7afc5b1765..a98499b2e2 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False): revmap = dict(((node, element) for element, node in mapping.items())) return pcgts, etree, mapping, revmap -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: +def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..330fefe97d 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport