qurator-spk · vahidrezanezhad · Apr 22, 2021 · Apr 13, 2021 · Apr 13, 2021 · Apr 13, 2021
diff --git a/ocrd-tool.json b/ocrd-tool.json
@@ -0,0 +1 @@
+qurator/eynollah/ocrd-tool.json
diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py
@@ -117,20 +117,19 @@ def main(
         print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa or -si")
         sys.exit(1)
     eynollah = Eynollah(
-        image,
-        None,
-        out,
-        model,
-        save_images,
-        save_layout,
-        save_deskewed,
-        save_all,
-        enable_plotting,
-        allow_enhancement,
-        curved_line,
-        full_layout,
-        allow_scaling,
-        headers_off,
+        image_filename=image,
+        dir_out=out,
+        dir_models=model,
+        dir_of_cropped_images=save_images,
+        dir_of_layout=save_layout,
+        dir_of_deskewed=save_deskewed,
+        dir_of_all=save_all,
+        enable_plotting=enable_plotting,
+        allow_enhancement=allow_enhancement,
+        curved_line=curved_line,
+        full_layout=full_layout,
+        allow_scaling=allow_scaling,
+        headers_off=headers_off,
     )
     pcgts = eynollah.run()
     eynollah.writer.write_pagexml(pcgts)

diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py
@@ -65,7 +65,7 @@
     order_of_regions,
     find_number_of_columns_in_document,
     return_boxes_of_images_by_order_of_reading_new)
-from .utils.pil_cv2 import check_dpi
+from .utils.pil_cv2 import check_dpi, pil2cv
 from .utils.xml import order_and_id_of_texts
 from .plot import EynollahPlotter
 from .writer import EynollahXmlWriter
@@ -79,10 +79,11 @@
 class Eynollah:
     def __init__(
         self,
-        image_filename,
-        image_filename_stem,
-        dir_out,
         dir_models,
+        image_filename,
+        image_pil=None,
+        image_filename_stem=None,
+        dir_out=None,
         dir_of_cropped_images=None,
         dir_of_layout=None,
         dir_of_deskewed=None,
@@ -92,30 +93,35 @@ def __init__(
         curved_line=False,
         full_layout=False,
         allow_scaling=False,
-        headers_off=False
+        headers_off=False,
+        override_dpi=None,
+        logger=None,
+        pcgts=None,
     ):
+        if image_pil:
+            self._imgs = self._cache_images(image_pil=image_pil)
+        else:
+            self._imgs = self._cache_images(image_filename=image_filename)
         self.image_filename = image_filename
         self.dir_out = dir_out
-        self.image_filename_stem = image_filename_stem
         self.allow_enhancement = allow_enhancement
         self.curved_line = curved_line
         self.full_layout = full_layout
         self.allow_scaling = allow_scaling
         self.headers_off = headers_off
-        if not self.image_filename_stem:
-            self.image_filename_stem = Path(Path(image_filename).name).stem
+        self.override_dpi = override_dpi
         self.plotter = None if not enable_plotting else EynollahPlotter(
             dir_of_all=dir_of_all,
             dir_of_deskewed=dir_of_deskewed,
             dir_of_cropped_images=dir_of_cropped_images,
             dir_of_layout=dir_of_layout,
-            image_filename=image_filename,
-            image_filename_stem=self.image_filename_stem)
+            image_filename_stem=Path(Path(image_filename).name).stem)
         self.writer = EynollahXmlWriter(
             dir_out=self.dir_out,
             image_filename=self.image_filename,
-            curved_line=self.curved_line)
-        self.logger = getLogger('eynollah')
+            curved_line=self.curved_line,
+            pcgts=pcgts)
+        self.logger = logger if logger else getLogger('eynollah')
         self.dir_models = dir_models
 
         self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
@@ -128,24 +134,26 @@ def __init__(
         self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5"
         self.model_textline_dir = dir_models + "/model_textline_newspapers.h5"
 
-        self._imgs = {}
+    def _cache_images(self, image_filename=None, image_pil=None):
+        """Build the dict served by imread(): BGR and grayscale arrays plus their uint8 casts."""
+        img = cv2.imread(image_filename) if image_filename else pil2cv(image_pil)
+        if img is None:  # cv2.imread signals an unreadable file by returning None
+            raise ValueError(f"Cannot read image file: {image_filename}")
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        return {
+            'img': img, 'img_grayscale': gray,
+            'img_uint8': img.astype(np.uint8), 'img_grayscale_uint8': gray.astype(np.uint8),
+        }
 
     def imread(self, grayscale=False, uint8=True):
         key = 'img'
         if grayscale:
             key += '_grayscale'
         if uint8:
             key += '_uint8'
-        if key not in self._imgs:
-            if grayscale:
-                img = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE)
-            else:
-                img = cv2.imread(self.image_filename)
-            if uint8:
-                img = img.astype(np.uint8)
-            self._imgs[key] = img
         return self._imgs[key].copy()
 
+
     def predict_enhancement(self, img):
         self.logger.debug("enter predict_enhancement")
         model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement)
@@ -346,10 +354,9 @@ def resize_image_with_column_classifier(self, is_image_enhanced):
 
     def resize_and_enhance_image_with_column_classifier(self):
         self.logger.debug("enter resize_and_enhance_image_with_column_classifier")
-        try:
-            dpi = check_dpi(self.image_filename)
-        except:
-            dpi = 230
+        if self.override_dpi:
+            return self.override_dpi
+        dpi = check_dpi(self.imread())
         self.logger.info("Detected %s DPI", dpi)
         img = self.imread()
 
@@ -1446,7 +1453,6 @@ def run_enhancement(self):
         scale = 1
         if is_image_enhanced:
             if self.allow_enhancement:
-                cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res)
                 img_res = img_res.astype(np.uint8)
                 self.get_image_and_scales(img_org, img_res, scale)
             else:

diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json
@@ -0,0 +1,54 @@
+{
+  "version": "0.0.1",
+  "git_url": "https://github.com/qurator-spk/eynollah",
+  "tools": {
+    "ocrd-eynollah-segment": {
+      "executable": "ocrd-eynollah-segment",
+      "categories": ["Layout analysis"],
+      "description": "Segment page into regions and lines and do reading order detection with eynollah",
+      "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
+      "output_file_grp": ["OCR-D-SEG-LINE"],
+      "steps": ["layout/segmentation/region", "layout/segmentation/line"],
+      "parameters": {
+        "models": {
+          "type": "string",
+          "format": "file",
+          "cacheable": true,
+          "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
+          "required": true
+        },
+        "dpi": {
+          "type": "number",
+          "format": "float",
+          "description": "pixel density in dots per inch (overrides any meta-data in the images); ignored if <= 0 (with fall-back 230)",
+          "default": 0
+        },
+        "full_layout": {
+          "type": "boolean",
+          "default": true,
+          "description": "Try to detect all element subtypes, including drop-caps and headings"
+        },
+        "curved_line": {
+          "type": "boolean",
+          "default": false,
+          "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time"
+        },
+        "allow_enhancement": {
+          "type": "boolean",
+          "default": true,
+          "description": "if the input image has less than 300 DPI, then upscale and enhance"
+        },
+        "allow_scaling": {
+          "type": "boolean",
+          "default": false,
+          "description": "if this parameter is set to true, this tool will check the image scale and, if needed, rescale the image to perform better layout detection"
+        },
+        "headers_off": {
+          "type": "boolean",
+          "default": false,
+          "description": "ignore the special role of headings during reading order detection"
+        }
+      }
+    }
+  }
+}
diff --git a/qurator/eynollah/ocrd_cli.py b/qurator/eynollah/ocrd_cli.py
@@ -0,0 +1,11 @@
+from .processor import EynollahProcessor
+from click import command
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+
+@command()
+@ocrd_cli_options
+def main(*args, **kwargs):  # click command; its options are declared by @ocrd_cli_options
+    return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)  # forward all parsed options to the OCR-D wrapper for EynollahProcessor
+
+if __name__ == '__main__':
+    main()
diff --git a/qurator/eynollah/plot.py b/qurator/eynollah/plot.py
@@ -21,7 +21,6 @@ def __init__(
         dir_of_deskewed,
         dir_of_layout,
         dir_of_cropped_images,
-        image_filename,
         image_filename_stem,
         image_org=None,
         scale_x=1,
@@ -31,7 +30,6 @@ def __init__(
         self.dir_of_layout = dir_of_layout
         self.dir_of_cropped_images = dir_of_cropped_images
         self.dir_of_deskewed = dir_of_deskewed
-        self.image_filename = image_filename
         self.image_filename_stem = image_filename_stem
         # XXX TODO hacky these cannot be set at init time
         self.image_org = image_org

diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py
@@ -0,0 +1,59 @@
+from json import loads
+from pkg_resources import resource_string
+from tempfile import NamedTemporaryFile
+from os.path import join
+
+from ocrd import Processor
+from ocrd_modelfactory import page_from_file
+from ocrd_models.ocrd_page import to_xml
+from ocrd_utils import (
+    getLogger,
+    MIMETYPE_PAGE,
+    assert_file_grp_cardinality,
+    make_file_id
+)
+
+from .eynollah import Eynollah
+from .utils.pil_cv2 import pil2cv
+
+OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+
+class EynollahProcessor(Processor):
+    """OCR-D processor that runs Eynollah on every page of the input file group."""
+
+    def __init__(self, *args, **kwargs):
+        # inject tool description and version from the bundled ocrd-tool.json
+        kwargs.update(ocrd_tool=OCRD_TOOL['tools']['ocrd-eynollah-segment'], version=OCRD_TOOL['version'])
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        """Run Eynollah on each input page and add the resulting PAGE-XML to the output group."""
+        log = getLogger('eynollah')
+        for file_grp in (self.input_file_grp, self.output_file_grp):
+            assert_file_grp_cardinality(file_grp, 1)
+        params = self.parameter
+        for idx, input_file in enumerate(self.input_files, start=1):
+            page_id = input_file.pageId or input_file.ID
+            log.info("INPUT FILE %s (%d/%d) ", page_id, idx, len(self.input_files))
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
+            page_image, _coords, _info = self.workspace.image_from_page(
+                pcgts.get_Page(), page_id, feature_filter='binarized')
+            # pcgts is handed to Eynollah and serialized below; run()'s return value is unused
+            Eynollah(
+                dir_models=self.resolve_resource(params['models']),
+                allow_enhancement=params['allow_enhancement'],
+                curved_line=params['curved_line'],
+                full_layout=params['full_layout'],
+                allow_scaling=params['allow_scaling'],
+                headers_off=params['headers_off'],
+                override_dpi=params['dpi'] if params['dpi'] > 0 else None,
+                logger=log,
+                pcgts=pcgts,
+                image_pil=page_image,
+                image_filename=None).run()
+            file_id = make_file_id(input_file, self.output_file_grp)
+            self.workspace.add_file(
+                ID=file_id, file_grp=self.output_file_grp, pageId=page_id, mimetype=MIMETYPE_PAGE,
+                local_filename=join(self.output_file_grp, file_id) + '.xml',
+                content=to_xml(pcgts))
diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py
@@ -6,19 +6,23 @@
 # from sbb_binarization
 
 def cv2pil(img):
-    return Image.fromarray(img.astype('uint8'))
+    return Image.fromarray(img)
 
 def pil2cv(img):
     # from ocrd/workspace.py
     color_conversion = COLOR_GRAY2BGR if img.mode in ('1', 'L') else  COLOR_RGB2BGR
     pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img)
     return cvtColor(pil_as_np_array, color_conversion)
 
-def check_dpi(image_filename):
-    exif = OcrdExif(Image.open(image_filename))
-    print(exif.to_xml())
-    resolution = exif.resolution
-    if exif.resolutionUnit == 'cm':
-        resolution /= 2.54
-    return int(resolution)
-
+def check_dpi(img):
+    """Return the pixel density of img in DPI, falling back to 230 if it cannot be determined."""
+    try:
+        exif = OcrdExif(cv2pil(img))
+        # a resolution of 1 is treated as "not set"; no need to raise just to reach the fallback
+        if exif.resolution == 1:
+            return 230
+        # 1 inch = 2.54 cm, so pixels/cm must be multiplied (not divided) to get pixels/inch
+        return int(round(exif.resolution * (2.54 if exif.resolutionUnit == 'cm' else 1)))
+    except Exception as e:
+        print(e)
+        return 230
diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py
@@ -28,14 +28,17 @@ def __init__(self, *, dir_out, image_filename, curved_line, pcgts=None):
         self.counter = EynollahIdCounter()
         self.dir_out = dir_out
         self.image_filename = image_filename
-        self.image_filename_stem = Path(Path(image_filename).name).stem
         self.curved_line = curved_line
-        self.pcgts = pcgts if pcgts else PcGtsType()
+        self.pcgts = pcgts
         self.scale_x = None # XXX set outside __init__
         self.scale_y = None # XXX set outside __init__
         self.height_org = None # XXX set outside __init__
         self.width_org = None # XXX set outside __init__
 
+    @property
+    def image_filename_stem(self):
+        return Path(self.image_filename).stem  # stem is taken from the final path component only
+
     def calculate_page_coords(self, cont_page):
         self.logger.debug('enter calculate_page_coords')
         points_page_print = ""
@@ -87,7 +90,7 @@ def serialize_lines_in_marginal(self, marginal_region, all_found_texline_polygon
                         points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
                         points_co += ','
                         points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
-                points += ' '
+                points_co += ' '
             coords.set_points(points_co[:-1])
 
     def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter):
@@ -141,7 +144,7 @@ def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, o
         self.logger.debug('enter build_pagexml_no_full_layout')
 
         # create the file structure
-        pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
+        pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
         page = pcgts.get_Page()
         page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))
 
@@ -181,7 +184,7 @@ def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_t
         self.logger.debug('enter build_pagexml_full_layout')
 
         # create the file structure
-        pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
+        pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
         page = pcgts.get_Page()
         page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))