Skip to content

Commit 2f4e26b

Browse files
committed
Ignore KV elements in LAYOUT_LIST
1 parent 7218d8e commit 2f4e26b

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

textractor/entities/layout.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,7 @@ def get_text_and_words(
139139
final_text = add_id_to_html_tag(config.list_layout_prefix, self.id, config)
140140
final_words = []
141141
for i, child in enumerate(
142-
sorted(self.children, key=lambda x: x.reading_order)
142+
sorted(filter(lambda c: isinstance(c, Layout), self.children), key=lambda x: x.reading_order)
143143
):
144144
child_text, child_words = child.get_text_and_words(config)
145145
child_prefix = add_id_to_html_tag(config.list_element_prefix, child.id, config)

textractor/parsers/response_parser.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1363,15 +1363,15 @@ def parse_document_api_response(response: dict) -> Document:
13631363
# Using the kv_added returned by _create_table_objects, we try to match the remaining KVs
13641364
# to existing layout elements.
13651365
for layout in sorted(page.layouts, key=lambda x: x.bbox.y):
1366-
if layout.layout_type == LAYOUT_ENTITY:
1366+
if layout.layout_type in LAYOUT_ENTITY:
13671367
continue
13681368
for kv in sorted(key_values, key=lambda x: x.bbox.y):
13691369
if (
13701370
layout.bbox.get_intersection(kv.bbox).area > THRESHOLD * kv.bbox.area
13711371
and kv.id not in kv_added
13721372
):
13731373
# Ignore if the KV is already overlapping with a table
1374-
if any([w.cell_id for w in kv.words]):
1374+
if any([w.cell_id for w in kv.words]) or layout.layout_type == LAYOUT_LIST:
13751375
kv_added.add(kv.id)
13761376
continue
13771377
# Removing the duplicate words

0 commit comments

Comments
 (0)