Skip to content

Commit 189e31c

Browse files
authored
Merge branch 'aws-samples:master' into master
2 parents ccb1c47 + 9fb7d22 commit 189e31c

File tree

4 files changed

+44
-36
lines changed

4 files changed

+44
-36
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ Textractor can be cited using:
127127
author = {Belval, Edouard and Delteil, Thomas and Schade, Martin and Radhakrishna, Srividhya},
128128
title = {{Amazon Textractor}},
129129
url = {https://github.com/aws-samples/amazon-textract-textractor},
130-
version = {1.8.4},
130+
version = {1.8.5},
131131
year = {2024}
132132
}
133133
```

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _run(self, command):
4141
setup(
4242
# include data files
4343
name="amazon-textract-textractor",
44-
version="1.8.4",
44+
version="1.8.5",
4545
license="Apache 2.0",
4646
description="A package to use AWS Textract services.",
4747
url="https://github.com/aws-samples/amazon-textract-textractor",

textractor/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
__version__ = "1.8.4"
1+
__version__ = "1.8.5"
22

33
from .textractor import Textractor

textractor/utils/legacy_utils.py

Lines changed: 41 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
from textractor.data.constants import (
23
LAYOUT_FIGURE,
34
LAYOUT_LIST,
@@ -13,39 +14,46 @@
1314

1415
def converter(response):
1516
blocks_to_delete = []
16-
page_block = None
17-
for i, block in enumerate(response["Blocks"]):
18-
if block.get("BlockType") == "PAGE":
19-
page_block = block
20-
elif block.get("BlockType", "").startswith("LAYOUT_FIGURE_"):
21-
block["BlockType"] = LAYOUT_TEXT
22-
elif (
23-
block.get("BlockType", "").startswith("LAYOUT_") and
24-
block.get("BlockType") not in [
25-
LAYOUT_TEXT,
26-
LAYOUT_TITLE,
27-
LAYOUT_HEADER,
28-
LAYOUT_FOOTER,
29-
LAYOUT_SECTION_HEADER,
30-
LAYOUT_PAGE_NUMBER,
31-
LAYOUT_LIST,
32-
LAYOUT_FIGURE,
33-
LAYOUT_TABLE,
34-
LAYOUT_KEY_VALUE,
35-
]
36-
):
37-
block["BlockType"] = LAYOUT_FIGURE
38-
elif block.get("BlockType") == LAYOUT_FIGURE and "CONTAINER" in block.get("EntityTypes", []):
39-
blocks_to_delete.append((i, block))
40-
41-
page_relationships = []
42-
for relationship in page_block["Relationships"]:
43-
if relationship["Type"] == "CHILD":
44-
page_relationships = relationship["Ids"]
45-
break
17+
page_blocks = []
18+
try:
19+
for i, block in enumerate(response["Blocks"]):
20+
if block.get("BlockType") == "PAGE":
21+
page_blocks.append(block)
22+
elif block.get("BlockType", "").startswith("LAYOUT_FIGURE_"):
23+
block["BlockType"] = LAYOUT_TEXT
24+
elif (
25+
block.get("BlockType", "").startswith("LAYOUT_") and
26+
block.get("BlockType") not in [
27+
LAYOUT_TEXT,
28+
LAYOUT_TITLE,
29+
LAYOUT_HEADER,
30+
LAYOUT_FOOTER,
31+
LAYOUT_SECTION_HEADER,
32+
LAYOUT_PAGE_NUMBER,
33+
LAYOUT_LIST,
34+
LAYOUT_FIGURE,
35+
LAYOUT_TABLE,
36+
LAYOUT_KEY_VALUE,
37+
]
38+
):
39+
block["BlockType"] = LAYOUT_FIGURE
40+
elif block.get("BlockType") == LAYOUT_FIGURE and "CONTAINER" in block.get("EntityTypes", []):
41+
blocks_to_delete.append((i, block))
4642

47-
for i, block in blocks_to_delete[::-1]:
48-
del response["Blocks"][i]
49-
page_relationships.remove(block["Id"])
43+
blocks_to_delete_id_set = set([b["Id"] for _, b in blocks_to_delete])
44+
for page_block in page_blocks:
45+
for relationship in page_block.get("Relationships", []):
46+
if relationship["Type"] == "CHILD":
47+
relationship["Ids"] = [
48+
id
49+
for id in relationship["Ids"]
50+
if id not in blocks_to_delete_id_set
51+
]
52+
break
53+
54+
for i, block in blocks_to_delete[::-1]:
55+
del response["Blocks"][i]
56+
except Exception as ex:
57+
logging.warning(f"Failed to convert the response for backward compatibility. {str(ex)}")
5058

5159
return response

0 commit comments

Comments
 (0)