From 03c38d92b556e93f1c53e019cc0a21a0af62b076 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 10 Feb 2021 11:40:56 +0100 Subject: [PATCH] :package: 0.1.9 (extract*: expose param `feature_filter`) --- CHANGELOG.md | 7 ++++++ ocrd_segment/extract_glyphs.py | 8 +++++++ ocrd_segment/extract_lines.py | 11 +++++---- ocrd_segment/extract_pages.py | 6 ++++- ocrd_segment/extract_words.py | 7 ++++++ ocrd_segment/ocrd-tool.json | 43 ++++++++++++++++++++++++---------- 6 files changed, 65 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 896a73f..3e967fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [0.1.9] + +Changed: + + * extract-regions/lines/words/glyphs: add `feature_filter` param + ## [0.1.8] Fixed: @@ -89,6 +95,7 @@ Changed: * further improve README +[0.1.9]: ../../compare/v0.1.8...v0.1.9 [0.1.8]: ../../compare/v0.1.7...v0.1.8 [0.1.7]: ../../compare/v0.1.6...v0.1.7 [0.1.6]: ../../compare/v0.1.5...v0.1.6 diff --git a/ocrd_segment/extract_glyphs.py b/ocrd_segment/extract_glyphs.py index e601054..5f3efa7 100644 --- a/ocrd_segment/extract_glyphs.py +++ b/ocrd_segment/extract_glyphs.py @@ -34,6 +34,9 @@ def process(self): Extract an image for each glyph (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. + Apply ``feature_filter`` (a comma-separated list of image features, + cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip + specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. 
@@ -75,6 +78,7 @@ def process(self): page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) if page_image_info.resolution != 1: dpi = page_image_info.resolution @@ -92,6 +96,7 @@ def process(self): for region in regions: region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) rtype = region.get_type() @@ -101,6 +106,7 @@ def process(self): for line in lines: line_image, line_coords = self.workspace.image_from_segment( line, region_image, region_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) words = line.get_Word() if not words: @@ -108,6 +114,7 @@ def process(self): for word in words: word_image, word_coords = self.workspace.image_from_segment( word, line_image, line_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) glyphs = word.get_Glyph() if not glyphs: @@ -115,6 +122,7 @@ def process(self): for glyph in glyphs: glyph_image, glyph_coords = self.workspace.image_from_segment( glyph, word_image, word_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) lpolygon_rel = coordinates_of_segment( glyph, glyph_image, glyph_coords).tolist() diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py index ea81dad..a32c55a 100644 --- a/ocrd_segment/extract_lines.py +++ b/ocrd_segment/extract_lines.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import json -import itertools from ocrd_utils import ( getLogger, @@ -34,6 +33,9 @@ def process(self): Extract an image for each textline (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and 
masked by the coordinate polygon outline. + Apply ``feature_filter`` (a comma-separated list of image features, + cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip + specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. @@ -75,6 +77,7 @@ def process(self): page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) if page_image_info.resolution != 1: dpi = page_image_info.resolution @@ -84,14 +87,13 @@ def process(self): dpi = None ptype = page.get_type() - regions = itertools.chain.from_iterable( - [page.get_TextRegion()] + - [subregion.get_TextRegion() for subregion in page.get_TableRegion()]) + regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: LOG.warning("Page '%s' contains no text regions", page_id) for region in regions: region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) rtype = region.get_type() @@ -101,6 +103,7 @@ def process(self): for line in lines: line_image, line_coords = self.workspace.image_from_segment( line, region_image, region_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) lpolygon_rel = coordinates_of_segment( line, line_image, line_coords).tolist() diff --git a/ocrd_segment/extract_pages.py b/ocrd_segment/extract_pages.py index d450a70..7cf0d15 100644 --- a/ocrd_segment/extract_pages.py +++ b/ocrd_segment/extract_pages.py @@ -180,7 +180,11 @@ def process(self): # pylint: disable=attribute-defined-outside-init for n, input_file in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID - num_page_id = 
int(page_id.strip(page_id.strip("0123456789"))) + try: + # separate non-numeric part of page ID to retain the numeric part + num_page_id = int(page_id.strip(page_id.strip("0123456789"))) + except Exception: + num_page_id = n LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) diff --git a/ocrd_segment/extract_words.py b/ocrd_segment/extract_words.py index 0c93a0b..b4b126a 100644 --- a/ocrd_segment/extract_words.py +++ b/ocrd_segment/extract_words.py @@ -34,6 +34,9 @@ def process(self): Extract an image for each word (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. + Apply ``feature_filter`` (a comma-separated list of image features, + cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip + specific features when retrieving derived images. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. 
@@ -75,6 +78,7 @@ def process(self): page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) if page_image_info.resolution != 1: dpi = page_image_info.resolution @@ -92,6 +96,7 @@ def process(self): for region in regions: region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) rtype = region.get_type() @@ -101,6 +106,7 @@ def process(self): for line in lines: line_image, line_coords = self.workspace.image_from_segment( line, region_image, region_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) words = line.get_Word() if not words: @@ -108,6 +114,7 @@ def process(self): for word in words: word_image, word_coords = self.workspace.image_from_segment( word, line_image, line_coords, + feature_filter=self.parameter['feature_filter'], transparency=self.parameter['transparency']) lpolygon_rel = coordinates_of_segment( word, word_image, word_coords).tolist() diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json index af16e84..f3cdb40 100644 --- a/ocrd_segment/ocrd-tool.json +++ b/ocrd_segment/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.1.8", + "version": "0.1.9", "git_url": "https://github.com/OCR-D/ocrd_segment", "tools": { "ocrd-segment-repair": { @@ -131,7 +131,7 @@ "OCR-D-GT-SEG-BLOCK" ], "output_file_grp": [ - "OCR-D-IMG-CROP" + "OCR-D-IMG-PAGE" ], "steps": ["layout/analysis"], "parameters": { @@ -231,10 +231,15 @@ "OCR-D-GT-SEG-BLOCK" ], "output_file_grp": [ - "OCR-D-IMG-CROP" + "OCR-D-IMG-REGION" ], "steps": ["layout/analysis"], "parameters": { + "feature_filter": { + "type": "string", + "default": "", + "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)." 
+ }, "mimetype": { "type": "string", "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"], @@ -257,10 +262,15 @@ "OCR-D-GT-SEG-LINE" ], "output_file_grp": [ - "OCR-D-IMG-CROP" + "OCR-D-IMG-LINE" ], "steps": ["layout/analysis"], "parameters": { + "feature_filter": { + "type": "string", + "default": "", + "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)." + }, "mimetype": { "type": "string", "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"], @@ -283,10 +293,15 @@ "OCR-D-GT-SEG-WORD" ], "output_file_grp": [ - "OCR-D-IMG-CROP" + "OCR-D-IMG-WORD" ], "steps": ["layout/analysis"], "parameters": { + "feature_filter": { + "type": "string", + "default": "", + "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)." + }, "mimetype": { "type": "string", "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"], @@ -309,10 +324,15 @@ "OCR-D-GT-SEG-GLYPH" ], "output_file_grp": [ - "OCR-D-IMG-CROP" + "OCR-D-IMG-GLYPH" ], "steps": ["layout/analysis"], "parameters": { + "feature_filter": { + "type": "string", + "default": "", + "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)." + }, "mimetype": { "type": "string", "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"], @@ -336,20 +356,19 @@ "OCR-D-OCR" ], "output_file_grp": [ - "OCR-D-SEG-CROP", - "OCR-D-IMG-CROP" + "OCR-D-SEG-CROP" ], "steps": ["layout/analysis"], "parameters": { "feature_selector": { "type": "string", "default": "", - "description": "comma-separated list of required image features (e.g. 
binarized,despeckled)" + "description": "Comma-separated list of required image features (e.g. `binarized,despeckled`)." }, "feature_filter": { "type": "string", "default": "", - "description": "comma-separated list of forbidden image features (e.g. binarized,despeckled)" + "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)." }, "transform_coordinates": { "type": "boolean", @@ -368,8 +387,8 @@ "OCR-D-OCR" ], "output_file_grp": [ - "OCR-D-SEG-CROP", - "OCR-D-IMG-CROP" + "OCR-D-SEG-LINE", + "OCR-D-OCR" ], "steps": ["layout/analysis"], "parameters": {