From 03c38d92b556e93f1c53e019cc0a21a0af62b076 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Wed, 10 Feb 2021 11:40:56 +0100
Subject: [PATCH] :package: 0.1.9 (extract*: expose param `feature_filter`)

---
 CHANGELOG.md                   |  7 ++++++
 ocrd_segment/extract_glyphs.py |  8 +++++++
 ocrd_segment/extract_lines.py  | 11 +++++----
 ocrd_segment/extract_pages.py  |  6 ++++-
 ocrd_segment/extract_words.py  |  7 ++++++
 ocrd_segment/ocrd-tool.json    | 43 ++++++++++++++++++++++++----------
 6 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 896a73f..3e967fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.1.9]
+
+Changed:
+
+ * extract-regions/lines/words/glyphs: add `feature_filter` param
+
 ## [0.1.8]
 
 Fixed:
@@ -89,6 +95,7 @@ Changed:
   * further improve README
 
 <!-- link-labels -->
+[0.1.9]: ../../compare/v0.1.8...v0.1.9
 [0.1.8]: ../../compare/v0.1.7...v0.1.8
 [0.1.7]: ../../compare/v0.1.6...v0.1.7
 [0.1.6]: ../../compare/v0.1.5...v0.1.6
diff --git a/ocrd_segment/extract_glyphs.py b/ocrd_segment/extract_glyphs.py
index e601054..5f3efa7 100644
--- a/ocrd_segment/extract_glyphs.py
+++ b/ocrd_segment/extract_glyphs.py
@@ -34,6 +34,9 @@ def process(self):
         Extract an image for each glyph (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +78,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -92,6 +96,7 @@ def process(self):
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
                 
@@ -101,6 +106,7 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     words = line.get_Word()
                     if not words:
@@ -108,6 +114,7 @@ def process(self):
                     for word in words:
                         word_image, word_coords = self.workspace.image_from_segment(
                             word, line_image, line_coords,
+                            feature_filter=self.parameter['feature_filter'],
                             transparency=self.parameter['transparency'])
                         glyphs = word.get_Glyph()
                         if not glyphs:
@@ -115,6 +122,7 @@ def process(self):
                         for glyph in glyphs:
                             glyph_image, glyph_coords = self.workspace.image_from_segment(
                                 glyph, word_image, word_coords,
+                                feature_filter=self.parameter['feature_filter'],
                                 transparency=self.parameter['transparency'])
                             lpolygon_rel = coordinates_of_segment(
                                 glyph, glyph_image, glyph_coords).tolist()
diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py
index ea81dad..a32c55a 100644
--- a/ocrd_segment/extract_lines.py
+++ b/ocrd_segment/extract_lines.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 
 import json
-import itertools
 
 from ocrd_utils import (
     getLogger,
@@ -34,6 +33,9 @@ def process(self):
         Extract an image for each textline (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +77,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -84,14 +87,13 @@ def process(self):
                 dpi = None
             ptype = page.get_type()
             
-            regions = itertools.chain.from_iterable(
-                [page.get_TextRegion()] +
-                [subregion.get_TextRegion() for subregion in page.get_TableRegion()])
+            regions = page.get_AllRegions(classes=['Text'], order='reading-order')
             if not regions:
                 LOG.warning("Page '%s' contains no text regions", page_id)
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
                 
@@ -101,6 +103,7 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     lpolygon_rel = coordinates_of_segment(
                         line, line_image, line_coords).tolist()
diff --git a/ocrd_segment/extract_pages.py b/ocrd_segment/extract_pages.py
index d450a70..7cf0d15 100644
--- a/ocrd_segment/extract_pages.py
+++ b/ocrd_segment/extract_pages.py
@@ -180,7 +180,11 @@ def process(self):
         # pylint: disable=attribute-defined-outside-init
         for n, input_file in enumerate(self.input_files):
             page_id = input_file.pageId or input_file.ID
-            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
+            try:
+                # strip non-numeric characters off the page ID to retain its numeric part
+                num_page_id = int(page_id.strip(page_id.strip("0123456789")))
+            except Exception:
+                num_page_id = n
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
             self.add_metadata(pcgts)
diff --git a/ocrd_segment/extract_words.py b/ocrd_segment/extract_words.py
index 0c93a0b..b4b126a 100644
--- a/ocrd_segment/extract_words.py
+++ b/ocrd_segment/extract_words.py
@@ -34,6 +34,9 @@ def process(self):
         Extract an image for each word (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +78,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -92,6 +96,7 @@ def process(self):
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
                 
@@ -101,6 +106,7 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     words = line.get_Word()
                     if not words:
@@ -108,6 +114,7 @@ def process(self):
                     for word in words:
                         word_image, word_coords = self.workspace.image_from_segment(
                             word, line_image, line_coords,
+                            feature_filter=self.parameter['feature_filter'],
                             transparency=self.parameter['transparency'])
                         lpolygon_rel = coordinates_of_segment(
                             word, word_image, word_coords).tolist()
diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json
index af16e84..f3cdb40 100644
--- a/ocrd_segment/ocrd-tool.json
+++ b/ocrd_segment/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.1.8",
+  "version": "0.1.9",
   "git_url": "https://github.com/OCR-D/ocrd_segment",
   "tools": {
     "ocrd-segment-repair": {
@@ -131,7 +131,7 @@
         "OCR-D-GT-SEG-BLOCK"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-PAGE"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
@@ -231,10 +231,15 @@
         "OCR-D-GT-SEG-BLOCK"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-REGION"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -257,10 +262,15 @@
         "OCR-D-GT-SEG-LINE"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-LINE"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -283,10 +293,15 @@
         "OCR-D-GT-SEG-WORD"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-WORD"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -309,10 +324,15 @@
         "OCR-D-GT-SEG-GLYPH"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-GLYPH"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -336,20 +356,19 @@
         "OCR-D-OCR"
       ],
       "output_file_grp": [
-        "OCR-D-SEG-CROP",
-        "OCR-D-IMG-CROP"
+        "OCR-D-SEG-CROP"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
           "feature_selector": {
               "type": "string",
               "default": "",
-              "description": "comma-separated list of required image features (e.g. binarized,despeckled)"
+              "description": "Comma-separated list of required image features (e.g. `binarized,despeckled`)."
           },
           "feature_filter": {
               "type": "string",
               "default": "",
-              "description": "comma-separated list of forbidden image features (e.g. binarized,despeckled)"
+              "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
           },
           "transform_coordinates": {
               "type": "boolean",
@@ -368,8 +387,8 @@
         "OCR-D-OCR"
       ],
       "output_file_grp": [
-        "OCR-D-SEG-CROP",
-        "OCR-D-IMG-CROP"
+        "OCR-D-SEG-LINE",
+        "OCR-D-OCR"
       ],
       "steps": ["layout/analysis"],
       "parameters": {