Skip to content

Commit

Permalink
fixed bboxes
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander1999-hub committed Nov 7, 2024
1 parent 9497dba commit 8b6238e
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 10 deletions.
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
page_range = range(first_page, first_page + len(gost_analyzed_images))
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
if isinstance(self, PdfTxtlayerReader):
self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from numpy import ndarray

from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
Expand Down Expand Up @@ -60,18 +59,23 @@ def _process_one_page(self,
if page is None:
return [], [], [], []
if parameters.need_gost_frame_analysis:
page_shift = self.gost_frame_boxes[page_number]
self._move_table_cells(tables=tables, page_shift=page_shift, page=page)
page_shift = self.gost_frame_boxes[page_number][0]
self._move_table_cells(tables=tables, page_shift=page_shift, page=self.gost_frame_boxes[page_number][1])
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
readable_block = page_shift # bbox representing the content of the gost frame
page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])] # exclude boxes outside the frame
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
else:
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

return lines, tables, page.attachments, []

def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None:
def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple) -> None:
"""
Move tables back to original coordinates when parsing a document containing a gost frame
"""
Expand All @@ -81,7 +85,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Pag
location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
for row in table.matrix_cells:
for cell in row:
image_width, image_height = page.pdf_page_width, page.pdf_page_height
image_width, image_height = page[1], page[0]
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
Expand Down
88 changes: 88 additions & 0 deletions tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
import unittest
from typing import List
Expand Down Expand Up @@ -214,22 +215,91 @@ def test_detect_small_table(self) -> None:
tables = result["content"]["tables"]
self.assertEqual(2, len(tables))

def _test_bbox_annotations(self, node: dict, target_dict: dict) -> None:
    """
    Check the "bounding box" annotation of a node against expected values.

    The value of the first annotation named "bounding box" is parsed as JSON, and every key of
    `target_dict` is compared with the parsed value using an absolute tolerance of 0.05.

    :param node: dict with an "annotations" list, each annotation is a dict with "name" and "value" keys
    :param target_dict: expected numeric values keyed by the bounding box field name
    """
    annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"]
    # fail with a readable test assertion instead of an IndexError when the annotation is absent
    self.assertTrue(annotations, "node has no \"bounding box\" annotation")
    annotations_dict = json.loads(annotations[0]["value"])
    for key, target_value in target_dict.items():
        self.assertAlmostEqual(float(annotations_dict[key]), target_value, delta=0.05, msg=f"bounding box field \"{key}\"")

def test_multipage_gost_table(self) -> None:
    """
    Parse a multipage table with gost frame analysis enabled (without the pdf_with_text_layer parameter)
    and verify the text and bounding box of each cell in the last row of the first table.
    """
    file_name = "gost_multipage_table.pdf"
    result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})  # don't pass pdf_with_text_layer to check condition in PDFBaseReader
    table = result["content"]["tables"][0]
    self.assertTrue(len(table["cells"]) > 35)

    # (expected text fragment, expected x_top_left, expected width) for consecutive cells of the last row
    expected_cells = [
        ("KR13", 0.15, 0.04),  # check the last row of multipage table
        ("R13.1", 0.25, 0.04),  # check that it belongs to first and only table
        ("Испытание по проверке", 0.33, 0.09),
        ("3.6", 0.78, 0.02),
        ("7.4.9", 0.88, 0.03),
    ]
    last_row = table["cells"][-1]
    for column, (text, x_top_left, width) in enumerate(expected_cells):
        line = last_row[column]["lines"][0]
        self.assertTrue(text in line["text"])
        # all checked cells share the same vertical position, height and page size
        target_bbox_dict = {
            "x_top_left": x_top_left,
            "y_top_left": 0.58,
            "width": width,
            "height": 0.009,
            "page_width": 1653,
            "page_height": 2339
        }
        self._test_bbox_annotations(line, target_bbox_dict)

def test_multipage_gost_table_with_text_layer(self) -> None:
    """
    Parse a multipage table with gost frame analysis and the pdf_with_text_layer parameter enabled,
    then verify the row count, the first-column text and bounding boxes of the first and last rows,
    and that exactly one table is returned.
    """
    file_name = "gost_multipage_table_2.pdf"
    request_data = {"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"}
    result = self._send_request(file_name, data=request_data)
    tables = result["content"]["tables"]
    cells = tables[0]["cells"]
    self.assertEqual(len(cells), 14)

    # (row index, expected text fragment, expected bounding box) for the first cell of the row
    expected_rows = [
        (0, "SAMPLE TEXT", {
            "x_top_left": 0.13,
            "y_top_left": 0.61,
            "width": 0.06,
            "height": 0.007,
            "page_width": 595,
            "page_height": 841
        }),
        (-1, "2", {
            "x_top_left": 0.13,
            "y_top_left": 0.15,
            "width": 0.005,
            "height": 0.007,
            "page_width": 595,
            "page_height": 841
        }),
    ]
    for row_index, text, target_bbox_dict in expected_rows:
        line = cells[row_index][0]["lines"][0]
        self.assertTrue(text in line["text"])
        self._test_bbox_annotations(line, target_bbox_dict)

    self.assertEqual(len(tables), 1)

def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
Expand All @@ -238,4 +308,22 @@ def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
self.assertEqual(len(result["content"]["tables"]), 1)
self.assertEqual(len(result["content"]["tables"][0]["cells"]), 5)
self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"])
target_bbox_dict_1 = {
"x_top_left": 0.13,
"y_top_left": 0.07,
"width": 0.06,
"height": 0.007,
"page_width": 595,
"page_height": 841
}
self._test_bbox_annotations(result["content"]["tables"][0]["cells"][0][0]["lines"][0], target_bbox_dict_1)
self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"])
target_bbox_dict_2 = {
"x_top_left": 0.13,
"y_top_left": 0.15,
"width": 0.005,
"height": 0.007,
"page_width": 595,
"page_height": 841
}
self._test_bbox_annotations(result["content"]["tables"][0]["cells"][-1][0]["lines"][0], target_bbox_dict_2)

0 comments on commit 8b6238e

Please sign in to comment.