Skip to content

Commit

Permalink
fix padding required for arrow offset testing
Browse files: browse the repository at this point in the history
  • Loading branch information
bkmartinjr committed Jan 26, 2025
1 parent b3cdcf8 commit 006140a
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 15 deletions.
2 changes: 1 addition & 1 deletion apis/python/tests/ht/_ht_test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#
# data corruption due to incorrect Arrow array offset handling
# See also sc-62104
"sc-61239_workaround": True,
"sc-61239_workaround": False,
# creating array with timestamp==0 fails in 1.15 (regression)
"sc-61054_workaround": True,
# Tables returned by SparseNDArray.read have incorrect nullability in schema fields
Expand Down
46 changes: 36 additions & 10 deletions apis/python/tests/ht/_ht_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,25 +342,51 @@ def arrow_slice(draw: st.DrawFn, size: int) -> ArrowSlice:
return (offset, length)


def pad_array(arr: npt.NDArray[Any], draw: st.DrawFn) -> pa.Array:
"""Strategy helper: add padding to one or both ends of the array. This tests for Arrow array "offset" handling."""
def pad_array(arr: pa.Array | npt.NDArray[Any], draw: st.DrawFn) -> pa.Array:
"""Strategy helper: add padding to one or both ends of the array. This tests for Arrow array
offset & length handling."""

if HT_TEST_CONFIG.get("sc-61239_workaround", False):
return pa.array(arr)

if not isinstance(arr, pa.Array):
arr = pa.array(arr)

head = draw(st.integers(min_value=0, max_value=16))
tail = draw(st.integers(min_value=0, max_value=16))
if not bool(head or tail):
return pa.array(arr)
return arr

if pa.types.is_dictionary(arr.type):
padding = draw(st.integers(min_value=0, max_value=len(arr.dictionary) - 1))
head_arr = pa.DictionaryArray.from_arrays(
indices=pa.array([padding] * head, type=arr.type.index_type),
dictionary=arr.dictionary,
ordered=arr.type.ordered,
)
tail_arr = pa.DictionaryArray.from_arrays(
indices=pa.array([padding] * tail, type=arr.type.index_type),
dictionary=arr.dictionary,
ordered=arr.type.ordered,
)

else:
if pa.types.is_large_string(arr.type) or pa.types.is_string(arr.type):
pad_type = str
elif pa.types.is_large_binary(arr.type) or pa.types.is_binary(arr.type):
pad_type = bytes
elif pa.types.is_timestamp(arr.type):
pad_type = np.int64
else:
pad_type = np.dtype(arr.type.to_pandas_dtype()).type

padding = draw(st.from_type(arr.dtype.type))
padding = draw(st.from_type(pad_type))
head_arr = pa.array([padding] * head).cast(arr.type)
tail_arr = pa.array([padding] * tail).cast(arr.type)

shape = (arr.shape[0] + head + tail, *arr.shape[1:])
padded_arr = np.empty_like(arr, shape=shape)
padded_arr[0:head] = padding
padded_arr[head : head + len(arr)] = arr
padded_arr[head + len(arr) :] = padding
return pa.array(padded_arr)[head : head + len(arr)]
assert arr.type == head_arr.type == tail_arr.type
padded_arr = pa.chunked_array([head_arr, arr, tail_arr]).combine_chunks()
return padded_arr.slice(head, len(arr))


@st.composite
Expand Down
5 changes: 1 addition & 4 deletions apis/python/tests/ht/test_ht_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,7 @@ def get_max_size() -> int:
if draw(st.booleans()) and not HT_TEST_CONFIG["sc-61239_workaround"]:
batches = tbl.to_batches()
batch_to_pad = draw(st.integers(min_value=0, max_value=len(batches) - 1))
batch_arrays = [
pad_array(arr.to_numpy(zero_copy_only=(arr.type != pa.bool_())), draw)
for arr in batches[batch_to_pad].columns
]
batch_arrays = [pad_array(arr, draw) for arr in batches[batch_to_pad].columns]
batches[batch_to_pad] = pa.RecordBatch.from_arrays(
batch_arrays, schema=tbl.schema
)
Expand Down

0 comments on commit 006140a

Please sign in to comment.