Skip to content

Commit 3ac4443

Browse files
badGarnet and cragwolfe authored
feat: add option to skip table chunking (#4338)
This PR adds a new chunker option, `skip_table_chunking`: when `True`, table elements are not split by the chunking process. It defaults to `False` (the current behavior). --------- Co-authored-by: cragwolfe <crag@unstructured.io>
1 parent dfb1653 commit 3ac4443

9 files changed

Lines changed: 170 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.22.21
2+
3+
### Enhancements
4+
5+
- **Skip table chunking option**: Add `skip_table_chunking` to basic/title chunking options. When `True`, `Table` elements are passed through unchanged without being split into `TableChunk` elements, regardless of their size. Defaults to `False` to preserve existing behavior.
6+
17
## 0.22.20
28

39
### Enhancements

test_unstructured/chunking/test_base.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,18 @@ def it_knows_whether_to_repeat_table_headers_by_default(
115115
):
116116
assert ChunkingOptions(**kwargs).repeat_table_headers is expected_value
117117

118+
@pytest.mark.parametrize(
119+
("kwargs", "expected_value"),
120+
[
121+
({"skip_table_chunking": True}, True),
122+
({"skip_table_chunking": False}, False),
123+
({"skip_table_chunking": None}, False),
124+
({}, False),
125+
],
126+
)
127+
def it_knows_whether_to_skip_table_chunking(self, kwargs: dict[str, Any], expected_value: bool):
128+
assert ChunkingOptions(**kwargs).skip_table_chunking is expected_value
129+
118130
@pytest.mark.parametrize("n_chars", [-1, -42])
119131
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
120132
with pytest.raises(
@@ -694,6 +706,29 @@ def it_can_chunk_elements_with_none_text_without_error(self):
694706
assert len(chunks) == 1
695707
assert chunks[0].text == "hello world"
696708

709+
def it_yields_an_oversized_table_unchanged_when_skip_table_chunking_is_True(self):
710+
table_text = "cell " * 200 # 1000 chars, well above max_characters=100
711+
table = Table(table_text.strip())
712+
opts = ChunkingOptions(max_characters=100, skip_table_chunking=True)
713+
pre_chunk = PreChunk([table], overlap_prefix="", opts=opts)
714+
715+
chunks = list(pre_chunk.iter_chunks())
716+
717+
assert len(chunks) == 1
718+
assert isinstance(chunks[0], Table)
719+
assert chunks[0] is table
720+
721+
def it_splits_an_oversized_table_when_skip_table_chunking_is_False(self):
722+
table_text = "cell " * 200 # 1000 chars, well above max_characters=100
723+
table = Table(table_text.strip())
724+
opts = ChunkingOptions(max_characters=100, skip_table_chunking=False)
725+
pre_chunk = PreChunk([table], overlap_prefix="", opts=opts)
726+
727+
chunks = list(pre_chunk.iter_chunks())
728+
729+
assert len(chunks) > 1
730+
assert all(isinstance(c, TableChunk) for c in chunks)
731+
697732
@pytest.mark.parametrize(
698733
("max_characters", "combine_text_under_n_chars", "expected_value"),
699734
[

test_unstructured/chunking/test_basic.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,44 @@ def test_it_repeats_table_headers_by_default_but_can_opt_out():
183183
]
184184

185185

186+
def test_skip_table_chunking_passes_oversized_table_through_unchanged():
187+
table_text = "cell " * 200 # 1000 chars, well above max_characters=100
188+
table = Table(table_text.strip())
189+
text_before = Text("Hello world")
190+
text_after = Text("Goodbye world")
191+
192+
chunks = chunk_elements(
193+
[text_before, table, text_after],
194+
max_characters=100,
195+
skip_table_chunking=True,
196+
)
197+
198+
assert len(chunks) == 3
199+
assert isinstance(chunks[0], CompositeElement)
200+
assert isinstance(chunks[1], Table)
201+
assert isinstance(chunks[2], CompositeElement)
202+
# -- table text is unchanged --
203+
assert chunks[1].text == table_text.strip()
204+
205+
206+
def test_skip_table_chunking_does_not_affect_text_element_chunking():
207+
long_text = Text("word " * 200)
208+
table = Table("small table")
209+
210+
chunks = chunk_elements(
211+
[long_text, table],
212+
max_characters=100,
213+
skip_table_chunking=True,
214+
)
215+
216+
# -- long text element is still split, table is still isolated --
217+
text_chunks = [c for c in chunks if isinstance(c, CompositeElement)]
218+
table_chunks = [c for c in chunks if isinstance(c, Table)]
219+
assert len(text_chunks) > 1
220+
assert len(table_chunks) == 1
221+
assert table_chunks[0].text == "small table"
222+
223+
186224
# ------------------------------------------------------------------------------------------------
187225
# UNIT TESTS
188226
# ------------------------------------------------------------------------------------------------
@@ -229,6 +267,23 @@ def it_supports_the_repeat_table_headers_option(
229267
_, opts = _chunk_elements_.call_args.args
230268
assert opts.repeat_table_headers is expected_value
231269

270+
@pytest.mark.parametrize(
271+
("kwargs", "expected_value"),
272+
[
273+
({"skip_table_chunking": True}, True),
274+
({"skip_table_chunking": False}, False),
275+
({"skip_table_chunking": None}, False),
276+
({}, False),
277+
],
278+
)
279+
def it_supports_the_skip_table_chunking_option(
280+
self, kwargs: dict[str, Any], expected_value: bool, _chunk_elements_: Mock
281+
):
282+
chunk_elements([], **kwargs)
283+
284+
_, opts = _chunk_elements_.call_args.args
285+
assert opts.skip_table_chunking is expected_value
286+
232287
# -- fixtures --------------------------------------------------------------------------------
233288

234289
@pytest.fixture()

test_unstructured/chunking/test_title.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,34 @@ def test_it_splits_oversized_table():
6565
assert all(isinstance(chunk, TableChunk) for chunk in chunks)
6666

6767

68+
def test_skip_table_chunking_passes_oversized_table_through_unchanged():
69+
elements = elements_from_json(input_path("chunking/table_2000.json"))
70+
71+
chunks = chunk_by_title(elements, skip_table_chunking=True)
72+
73+
assert len(chunks) == 1
74+
assert isinstance(chunks[0], Table)
75+
76+
77+
def test_skip_table_chunking_does_not_combine_table_with_adjacent_text():
78+
table_text = "cell " * 200
79+
table = Table(table_text.strip())
80+
text_before = Text("Hello world")
81+
text_after = Text("Goodbye world")
82+
83+
chunks = chunk_by_title(
84+
[text_before, table, text_after],
85+
max_characters=5000,
86+
combine_text_under_n_chars=5000,
87+
skip_table_chunking=True,
88+
)
89+
90+
assert isinstance(chunks[0], CompositeElement)
91+
assert isinstance(chunks[1], Table)
92+
assert isinstance(chunks[2], CompositeElement)
93+
assert chunks[1].text == table_text.strip()
94+
95+
6896
def test_it_repeats_table_headers_by_default_but_can_opt_out():
6997
table_html = (
7098
"<table>"
@@ -564,6 +592,23 @@ def it_supports_the_repeat_table_headers_option(
564592
_, opts = _chunk_by_title_.call_args.args
565593
assert opts.repeat_table_headers is expected_value
566594

595+
@pytest.mark.parametrize(
596+
("kwargs", "expected_value"),
597+
[
598+
({"skip_table_chunking": True}, True),
599+
({"skip_table_chunking": False}, False),
600+
({"skip_table_chunking": None}, False),
601+
({}, False),
602+
],
603+
)
604+
def it_supports_the_skip_table_chunking_option(
605+
self, kwargs: dict[str, Any], expected_value: bool, _chunk_by_title_: Mock
606+
):
607+
chunk_by_title([], **kwargs)
608+
609+
_, opts = _chunk_by_title_.call_args.args
610+
assert opts.skip_table_chunking is expected_value
611+
567612
# -- fixtures --------------------------------------------------------------------------------
568613

569614
@pytest.fixture()

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.20" # pragma: no cover
1+
__version__ = "0.22.21" # pragma: no cover

unstructured/chunking/base.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,15 @@ def repeat_table_headers(self) -> bool:
199199
arg_value = self._kwargs.get("repeat_table_headers")
200200
return True if arg_value is None else bool(arg_value)
201201

202+
@cached_property
203+
def skip_table_chunking(self) -> bool:
204+
"""When True, Table elements are passed through without chunking.
205+
206+
Default value is `False`.
207+
"""
208+
arg_value = self._kwargs.get("skip_table_chunking")
209+
return False if arg_value is None else bool(arg_value)
210+
202211
@cached_property
203212
def inter_chunk_overlap(self) -> int:
204213
"""Characters of overlap to add between chunks.
@@ -662,9 +671,12 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]:
662671
# -- it may need to be split into multiple `TableChunk` elements and that operation is
663672
# -- quite specialized.
664673
if len(self._elements) == 1 and isinstance(self._elements[0], Table):
665-
yield from _TableChunker.iter_chunks(
666-
self._elements[0], self._overlap_prefix, self._opts
667-
)
674+
if self._opts.skip_table_chunking:
675+
yield self._elements[0]
676+
else:
677+
yield from _TableChunker.iter_chunks(
678+
self._elements[0], self._overlap_prefix, self._opts
679+
)
668680
else:
669681
yield from _Chunker.iter_chunks(self._elements, self._text, self._opts)
670682

unstructured/chunking/basic.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def chunk_elements(
3333
overlap_all: Optional[bool] = None,
3434
tokenizer: Optional[str] = None,
3535
repeat_table_headers: Optional[bool] = None,
36+
skip_table_chunking: Optional[bool] = None,
3637
) -> list[Element]:
3738
"""Combine sequential `elements` into chunks, respecting specified text-length limits.
3839
@@ -80,6 +81,9 @@ def chunk_elements(
8081
repeat_table_headers
8182
Default: `True`. When `True`, repeated table-header behavior is enabled for chunked table
8283
continuations. Specify `False` to opt out and preserve legacy table-chunk behavior.
84+
skip_table_chunking
85+
Default: `False`. When `True`, `Table` elements are passed through unchanged without
86+
being split into `TableChunk` elements, regardless of their size.
8387
"""
8488
# -- raises ValueError on invalid parameters --
8589
opts = _BasicChunkingOptions.new(
@@ -92,6 +96,7 @@ def chunk_elements(
9296
overlap_all=overlap_all,
9397
tokenizer=tokenizer,
9498
repeat_table_headers=repeat_table_headers,
99+
skip_table_chunking=skip_table_chunking,
95100
)
96101

97102
return _chunk_elements(elements, opts)

unstructured/chunking/dispatch.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, lis
7171
+ "\n\t\trepeat_table_headers"
7272
+ "\n\t\t\tDefault: True. Repeat detected table headers on continuation"
7373
+ "\n\t\t\ttable chunks. Set to False to opt out."
74+
+ "\n\t\tskip_table_chunking"
75+
+ "\n\t\t\tDefault: False. When True, Table elements are passed through"
76+
+ "\n\t\t\tunchanged without being split into TableChunk elements."
7477
)
7578

7679
@functools.wraps(func)

unstructured/chunking/title.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def chunk_by_title(
3434
overlap_all: Optional[bool] = None,
3535
tokenizer: Optional[str] = None,
3636
repeat_table_headers: Optional[bool] = None,
37+
skip_table_chunking: Optional[bool] = None,
3738
) -> list[Element]:
3839
"""Uses title elements to identify sections within the document for chunking.
3940
@@ -87,6 +88,9 @@ def chunk_by_title(
8788
repeat_table_headers
8889
Default: `True`. When `True`, repeated table-header behavior is enabled for chunked table
8990
continuations. Specify `False` to opt out and preserve legacy table-chunk behavior.
91+
skip_table_chunking
92+
Default: `False`. When `True`, `Table` elements are passed through unchanged without
93+
being split into `TableChunk` elements, regardless of their size.
9094
"""
9195
opts = _ByTitleChunkingOptions.new(
9296
combine_text_under_n_chars=combine_text_under_n_chars,
@@ -100,6 +104,7 @@ def chunk_by_title(
100104
overlap_all=overlap_all,
101105
tokenizer=tokenizer,
102106
repeat_table_headers=repeat_table_headers,
107+
skip_table_chunking=skip_table_chunking,
103108
)
104109
return _chunk_by_title(elements, opts)
105110

0 commit comments

Comments
 (0)