Skip to content

Commit

Permalink
Fix .to_markdown() raising an exception on missing local config
Browse files: browse the repository at this point in the history
  • Loading branch information
Belval authored Jun 24, 2024
2 parents 32b5d76 + b209eae commit 2d06c1f
Showing 1 changed file with 47 additions and 47 deletions.
94 changes: 47 additions & 47 deletions textractor/entities/table.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -674,23 +674,24 @@ def to_html(self) -> str:
def get_text_and_words(
self, config: TextLinearizationConfig = TextLinearizationConfig()
):
local_config = deepcopy(config)
words_ = self.words
# If no text, return empty string
if not words_ and config.table_remove_column_headers:
if not words_ and local_config.table_remove_column_headers:
return "", []

# If not many words, only return text
if len(words_) < config.table_min_table_words:
if len(words_) < local_config.table_min_table_words:
return linearize_children(words_, config=config)

words = [Word(str(uuid.uuid4()), self.bbox, config.table_prefix)] if config.table_prefix else []
words = [Word(str(uuid.uuid4()), self.bbox, local_config.table_prefix)] if local_config.table_prefix else []
rows = sorted([(key, list(group)) for key, group in itertools.groupby(
self.table_cells, key=lambda cell: cell.row_index
)], key=lambda r: r[0])
processed_cells = set()
# Fill the table
row_offset = 0
if config.table_flatten_headers:
if local_config.table_flatten_headers:
columns = [[] for _ in range(len(rows[0][1]))]
columns_bbox = [[] for _ in range(len(rows[0][1]))]
for _, row in rows:
Expand All @@ -700,8 +701,8 @@ def get_text_and_words(
for i, cell in enumerate(row):
if (
cell not in processed_cells or
config.table_duplicate_text_in_merged_cells or
config.table_flatten_headers
local_config.table_duplicate_text_in_merged_cells or
local_config.table_flatten_headers
):
if cell.siblings:
# This handles the edge case where we are flattening the headers
Expand All @@ -720,21 +721,21 @@ def get_text_and_words(
_, words = cell.get_text_and_words(config)
columns[i].extend(words)
columns_bbox[i].append(cell.bbox)
elif config.table_cell_empty_cell_placeholder:
columns[i].append(Word(str(uuid.uuid4()), cell.bbox, config.table_cell_empty_cell_placeholder))
elif local_config.table_cell_empty_cell_placeholder:
columns[i].append(Word(str(uuid.uuid4()), cell.bbox, local_config.table_cell_empty_cell_placeholder))
row_offset += 1
if columns:
columns_bbox = [BoundingBox.enclosing_bbox(cbb) for cbb in columns_bbox]
if config.table_row_prefix and config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(columns_bbox), config.table_row_prefix, is_structure=True))
if local_config.table_row_prefix and local_config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(columns_bbox), local_config.table_row_prefix, is_structure=True))
for i, column in enumerate(columns):
words.append(
Word(
str(uuid.uuid4()),
columns_bbox[i],
config.table_cell_header_prefix
if config.table_cell_header_prefix
else config.table_cell_prefix,
local_config.table_cell_header_prefix
if local_config.table_cell_header_prefix
else local_config.table_cell_prefix,
is_structure=True
)
)
Expand All @@ -743,17 +744,17 @@ def get_text_and_words(
Word(
str(uuid.uuid4()),
columns_bbox[i],
config.table_cell_header_suffix
if config.table_cell_header_suffix
else config.table_cell_suffix,
local_config.table_cell_header_suffix
if local_config.table_cell_header_suffix
else local_config.table_cell_suffix,
is_structure=True
)
)
if config.table_row_suffix and config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), columns_bbox, config.table_row_suffix, is_structure=True))
if local_config.table_row_suffix and local_config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), columns_bbox, local_config.table_row_suffix, is_structure=True))
for _, cells in rows[row_offset:]:
if config.table_row_prefix and config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), config.table_row_prefix, is_structure=True))
if local_config.table_row_prefix and local_config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), local_config.table_row_prefix, is_structure=True))
for cell in sorted(cells, key=lambda c: c.col_index):
# Siblings includes the current cell
if cell.siblings:
Expand All @@ -765,35 +766,35 @@ def get_text_and_words(
row_index = first_row
row_span = last_row - first_row + 1
children = []
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
if (cell.col_index == first_col and cell.row_index == first_row) or local_config.table_duplicate_text_in_merged_cells:
for sib in cell.siblings:
children.extend(sib.children)
processed_cells.add(sib)
_, cell_words = linearize_children(children, config=config, no_new_lines=True)
elif cell.row_index == first_row and config.table_cell_left_merge_cell_placeholder:
elif cell.row_index == first_row and local_config.table_cell_left_merge_cell_placeholder:
# Left-merge token
cell_words = [
Word(str(uuid.uuid4()),
cell_bbox,
config.table_cell_left_merge_cell_placeholder,
local_config.table_cell_left_merge_cell_placeholder,
is_structure=True
)
]
elif cell.col_index == first_col and config.table_cell_top_merge_cell_placeholder:
elif cell.col_index == first_col and local_config.table_cell_top_merge_cell_placeholder:
# Top-merge token
cell_words = [
Word(str(uuid.uuid4()),
cell_bbox,
config.table_cell_top_merge_cell_placeholder,
local_config.table_cell_top_merge_cell_placeholder,
is_structure=True
)
]
elif cell.col_index != first_col and cell.row_index != first_row and config.table_cell_cross_merge_cell_placeholder:
elif cell.col_index != first_col and cell.row_index != first_row and local_config.table_cell_cross_merge_cell_placeholder:
# Cross-merge token (left and top)
cell_words = [
Word(str(uuid.uuid4()),
cell_bbox,
config.table_cell_cross_merge_cell_placeholder,
local_config.table_cell_cross_merge_cell_placeholder,
is_structure=True
)
]
Expand All @@ -807,15 +808,15 @@ def get_text_and_words(
row_index = cell.row_index
row_span = cell.row_span
_, cell_words = cell.get_text_and_words(config)
if config.add_prefixes_and_suffixes_as_words:
if config.table_cell_prefix or (config.table_cell_header_prefix and cell.is_column_header):
if local_config.add_prefixes_and_suffixes_as_words:
if local_config.table_cell_prefix or (local_config.table_cell_header_prefix and cell.is_column_header):
words.append(
Word(
str(uuid.uuid4()),
cell_bbox,
config.table_cell_header_prefix
if cell.is_column_header and config.table_cell_header_prefix
else config.table_cell_prefix,
local_config.table_cell_header_prefix
if cell.is_column_header and local_config.table_cell_header_prefix
else local_config.table_cell_prefix,
is_structure=True
)
)
Expand All @@ -827,15 +828,15 @@ def get_text_and_words(
words[-1].row_span = row_span

words.extend(cell_words)
if not cell_words and config.table_cell_empty_cell_placeholder:
words.append(Word(str(uuid.uuid4()), cell_bbox, config.table_cell_empty_cell_placeholder))
if not cell_words and local_config.table_cell_empty_cell_placeholder:
words.append(Word(str(uuid.uuid4()), cell_bbox, local_config.table_cell_empty_cell_placeholder))

if config.table_cell_suffix or (config.table_cell_header_suffix and cell.is_column_header):
if local_config.table_cell_suffix or (local_config.table_cell_header_suffix and cell.is_column_header):
words.append(
Word(
str(uuid.uuid4()),
cell_bbox,
config.table_cell_header_suffix if cell.is_column_header and config.table_cell_header_suffix else config.table_cell_suffix,
local_config.table_cell_header_suffix if cell.is_column_header and local_config.table_cell_header_suffix else local_config.table_cell_suffix,
is_structure=True
)
)
Expand All @@ -847,38 +848,37 @@ def get_text_and_words(
words[-1].row_span = row_span
else:
words.extend(cell_words)
if config.table_row_suffix and config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), config.table_row_suffix, is_structure=True))
if local_config.table_row_suffix and local_config.add_prefixes_and_suffixes_as_words:
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), local_config.table_row_suffix, is_structure=True))

if config.table_suffix:
words.append(Word(str(uuid.uuid4()), self.bbox, config.table_suffix))
if local_config.table_suffix:
words.append(Word(str(uuid.uuid4()), self.bbox, local_config.table_suffix))

for w in words:
w.table_id = str(self.id)
w.table_bbox = self.bbox

text = (config.table_prefix if config.add_prefixes_and_suffixes_in_text else "")
text = (local_config.table_prefix if local_config.add_prefixes_and_suffixes_in_text else "")
# Markdown
if config.table_linearization_format == "markdown":
if local_config.table_linearization_format == "markdown":
df = self.to_pandas(
use_columns=True,
config=config
)
has_column = any([isinstance(c, str) for c in df.columns])
if config.table_remove_column_headers:
if local_config.table_remove_column_headers:
headers = df.columns if has_column else ["" for c in df.columns]
else:
headers = df.columns
table = df.to_markdown(
tablefmt=config.table_tabulate_format, headers=headers, index=False
tablefmt=local_config.table_tabulate_format, headers=headers, index=False
)
if config.table_tabulate_remove_extra_hyphens:
if local_config.table_tabulate_remove_extra_hyphens:
while "-" * 2 in table:
table = table.replace("--", "-")
text += table
# Plaintext or HTML
else:
local_config = deepcopy(config)
# FIXME: The cyclomatic complexity of doing things like this will be unsustainable.
if local_config.table_flatten_semi_structured_as_plaintext and self.table_type == TableTypes.SEMI_STRUCTURED:
text = "<p>"
Expand Down Expand Up @@ -1030,7 +1030,7 @@ def get_text_and_words(
text += (local_config.table_row_suffix if local_config.add_prefixes_and_suffixes_in_text else "")
text += local_config.table_row_separator

if local_config.table_add_title_as_caption and self.title:
if local_config.table_add_title_as_caption and self.title and local_config.table_linearization_format == "html":
text += "<caption>" + self.title.get_text() + "</caption>"

text += (local_config.table_suffix if local_config.add_prefixes_and_suffixes_in_text else "")
Expand Down

0 comments on commit 2d06c1f

Please sign in to comment.