|
| 1 | +import logging |
1 | 2 | from textractor.data.constants import (
|
2 | 3 | LAYOUT_FIGURE,
|
3 | 4 | LAYOUT_LIST,
|
|
13 | 14 |
|
def converter(response):
    """Rewrite layout blocks of ``response`` in place and return it.

    The following changes are applied to ``response["Blocks"]``:

    * blocks whose ``BlockType`` starts with ``LAYOUT_FIGURE_`` become
      ``LAYOUT_TEXT``;
    * blocks whose ``BlockType`` starts with ``LAYOUT_`` but is not one of
      the layout types listed below become ``LAYOUT_FIGURE``;
    * ``LAYOUT_FIGURE`` blocks carrying the ``CONTAINER`` entity type are
      removed from ``response["Blocks"]``, and their ids are removed from
      the first ``CHILD`` relationship of every ``PAGE`` block.

    The conversion is best effort: any exception raised while converting is
    logged as a warning and swallowed, in which case ``response`` may have
    been only partially converted.

    Args:
        response: dict with a ``"Blocks"`` list of block dicts.

    Returns:
        The same ``response`` object that was passed in.
    """
    try:
        # Built once instead of once per block.
        known_layout_types = (
            LAYOUT_TEXT,
            LAYOUT_TITLE,
            LAYOUT_HEADER,
            LAYOUT_FOOTER,
            LAYOUT_SECTION_HEADER,
            LAYOUT_PAGE_NUMBER,
            LAYOUT_LIST,
            LAYOUT_FIGURE,
            LAYOUT_TABLE,
            LAYOUT_KEY_VALUE,
        )

        blocks = response["Blocks"]
        page_blocks = []
        containers = []  # (index, block) pairs scheduled for removal
        for index, block in enumerate(blocks):
            # ``or ""`` keeps an explicit None from aborting the conversion.
            block_type = block.get("BlockType") or ""
            if block_type == "PAGE":
                page_blocks.append(block)
            elif block_type.startswith("LAYOUT_FIGURE_"):
                block["BlockType"] = LAYOUT_TEXT
            elif (
                block_type.startswith("LAYOUT_")
                and block_type not in known_layout_types
            ):
                block["BlockType"] = LAYOUT_FIGURE
            elif (
                block_type == LAYOUT_FIGURE
                and "CONTAINER" in (block.get("EntityTypes") or [])
            ):
                containers.append((index, block))

        if containers:
            container_ids = {block["Id"] for _, block in containers}

            # Drop references to the removed blocks from every page.
            for page_block in page_blocks:
                for relationship in page_block.get("Relationships") or []:
                    if relationship["Type"] == "CHILD":
                        relationship["Ids"] = [
                            child_id
                            for child_id in relationship["Ids"]
                            if child_id not in container_ids
                        ]
                        # Only the first CHILD relationship is rewritten.
                        break

            # Delete from the end so the remaining indices stay valid.
            for index, _ in reversed(containers):
                del blocks[index]
    except Exception as ex:
        # Deliberate best effort: a malformed response must not break the
        # caller, so the failure is reported and the response returned as is.
        logging.warning(
            "Failed to convert the response for backward compatibility. %s",
            ex,
        )

    return response
|
0 commit comments