Skip to content

Commit

Permalink
Fix converter raising an exception on blank pages
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval committed Nov 13, 2024
1 parent 5a7a39b commit 0db2f94
Showing 1 changed file with 36 additions and 32 deletions.
68 changes: 36 additions & 32 deletions textractor/utils/legacy_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from textractor.data.constants import (
LAYOUT_FIGURE,
LAYOUT_LIST,
Expand All @@ -14,38 +15,41 @@
def converter(response):
    """Rewrite a response in place so that legacy consumers can read it.

    The function walks ``response["Blocks"]`` and, for each block:

    * remembers it when its ``BlockType`` is ``"PAGE"``;
    * relabels it as ``LAYOUT_TEXT`` when its type starts with
      ``"LAYOUT_FIGURE_"``;
    * relabels it as ``LAYOUT_FIGURE`` when its type starts with
      ``"LAYOUT_"`` but is not one of the layout types listed below;
    * schedules it for deletion when it is a ``LAYOUT_FIGURE`` whose
      ``EntityTypes`` contain ``"CONTAINER"``.

    Deleted blocks are also removed from the ``CHILD`` relationship ids of
    every ``PAGE`` block, so no page keeps a reference to a block that no
    longer exists. Pages without relationships (e.g. blank pages) and
    responses without any ``PAGE`` block are handled without error.

    The conversion is best-effort: any unexpected failure is logged as a
    warning and the (possibly partially converted) response is returned.

    Args:
        response: Dict holding a ``"Blocks"`` list of block dicts. It is
            mutated in place.

    Returns:
        The same ``response`` object that was passed in.
    """
    try:
        blocks = response["Blocks"]
        page_blocks = []
        container_figures = []

        for index, block in enumerate(blocks):
            block_type = block.get("BlockType", "")
            if block_type == "PAGE":
                page_blocks.append(block)
            elif block_type.startswith("LAYOUT_FIGURE_"):
                block["BlockType"] = LAYOUT_TEXT
            elif block_type.startswith("LAYOUT_") and block_type not in (
                LAYOUT_TEXT,
                LAYOUT_TITLE,
                LAYOUT_HEADER,
                LAYOUT_FOOTER,
                LAYOUT_SECTION_HEADER,
                LAYOUT_PAGE_NUMBER,
                LAYOUT_LIST,
                LAYOUT_FIGURE,
                LAYOUT_TABLE,
                LAYOUT_KEY_VALUE,
            ):
                block["BlockType"] = LAYOUT_FIGURE
            elif (
                block_type == LAYOUT_FIGURE
                and "CONTAINER" in (block.get("EntityTypes") or [])
            ):
                container_figures.append((index, block))

        if container_figures:
            # Collect the ids first so unlinking cannot fail half-way through
            # the deletions and leave the response in an inconsistent state.
            removed_ids = {block.get("Id") for _, block in container_figures}

            # Delete from the end so the earlier indices stay valid.
            for index, _ in reversed(container_figures):
                del blocks[index]

            # A deleted block may belong to any page, not only the last one
            # seen, so every page's CHILD id list is filtered. The list is
            # updated in place to keep existing references to it valid.
            for page in page_blocks:
                for relationship in page.get("Relationships") or []:
                    if relationship.get("Type") != "CHILD":
                        continue
                    child_ids = relationship.get("Ids")
                    if child_ids:
                        child_ids[:] = [
                            child_id
                            for child_id in child_ids
                            if child_id not in removed_ids
                        ]
    except Exception as ex:
        # Deliberate best-effort boundary: a conversion problem must not
        # prevent the caller from using the response.
        logging.warning(
            "Failed to convert the response for backward compatibility. %s", ex
        )

    return response

0 comments on commit 0db2f94

Please sign in to comment.