Skip to content

Commit 7125245

Browse files
authored
Skip row groups based on statistics (#4)
1 parent 6d2a2e2 commit 7125245

17 files changed

+922
-12
lines changed

ParquetSharp.Dataset.Test/Filter/TestIntFilter.cs

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.Linq;
34
using Apache.Arrow;
45
using NUnit.Framework;
6+
using ParquetSharp.Dataset.Filter;
57

68
namespace ParquetSharp.Dataset.Test.Filter;
79

@@ -128,6 +130,33 @@ public void TestComputeIntRangeMask((long, long) filterRange)
128130
TestComputeIntRangeMask<ulong, UInt64Array, UInt64Array.Builder>(rangeStart, rangeEnd, ULongValues, val => checked((long)val));
129131
}
130132

133+
/// <summary>
/// Runs the equality row-group inclusion check for the given filter value against
/// each of the integer value sets defined for this fixture.
/// </summary>
/// <param name="filterValue">The value the equality filter on column "x" compares against.</param>
[Theory]
public void TestIntEqualityIncludeRowGroup(long filterValue)
{
    // Signed element types: the conversion to long is an implicit widening.
    TestIntEqualityIncludeRowGroup(filterValue, SByteValues, value => value);
    TestIntEqualityIncludeRowGroup(filterValue, ShortValues, value => value);
    TestIntEqualityIncludeRowGroup(filterValue, IntValues, value => value);
    TestIntEqualityIncludeRowGroup(filterValue, LongValues, value => value);

    // Unsigned element types that also convert to long implicitly.
    TestIntEqualityIncludeRowGroup(filterValue, ByteValues, value => value);
    TestIntEqualityIncludeRowGroup(filterValue, UShortValues, value => value);
    TestIntEqualityIncludeRowGroup(filterValue, UIntValues, value => value);

    // This set needs an explicit checked cast, which throws OverflowException
    // for values that do not fit in a long.
    TestIntEqualityIncludeRowGroup(filterValue, ULongValues, value => checked((long)value));
}
145+
146+
/// <summary>
/// Runs the range row-group inclusion check for the given filter range against
/// each of the integer value sets defined for this fixture.
/// </summary>
/// <param name="filterRange">
/// The (start, end) pair passed to the range filter on column "x".
/// </param>
[Theory]
public void TestIntRangeIncludeRowGroup((long, long) filterRange)
{
    // Signed element types: the conversion to long is an implicit widening.
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, SByteValues, value => value);
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, ShortValues, value => value);
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, IntValues, value => value);
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, LongValues, value => value);

    // Unsigned element types that also convert to long implicitly.
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, ByteValues, value => value);
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, UShortValues, value => value);
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, UIntValues, value => value);

    // This set needs an explicit checked cast, which throws OverflowException
    // for values that do not fit in a long.
    TestIntRangeIncludeRowGroup(filterRange.Item1, filterRange.Item2, ULongValues, value => checked((long)value));
}
159+
131160
private static void TestComputeIntEqualityMask<T, TArray, TBuilder>(long filterValue, T[] values, Func<T, long> checkedCast)
132161
where T : struct
133162
where TArray : PrimitiveArray<T>
@@ -203,6 +232,104 @@ private static void TestComputeIntRangeMask<T, TArray, TBuilder>(long rangeStart
203232
}
204233
}
205234

235+
/// <summary>
/// Verifies that an equality filter on column "x" includes a row group exactly when
/// the filter value lies within the row group's [min, max] statistics.
/// Every (min, max) pair drawn from <paramref name="values"/> with max &gt;= min is tried.
/// </summary>
/// <typeparam name="T">Element type of the column statistics.</typeparam>
/// <param name="filterValue">The value the equality filter compares against.</param>
/// <param name="values">Candidate values used as both statistics minimums and maximums.</param>
/// <param name="checkedCast">
/// Converts a <typeparamref name="T"/> to long; an <see cref="OverflowException"/>
/// thrown by it is handled as "value not representable as a long".
/// </param>
private static void TestIntEqualityIncludeRowGroup<T>(long filterValue, T[] values, Func<T, long> checkedCast)
    where T : IComparable<T>
{
    var filter = Col.Named("x").IsEqualTo(filterValue);

    // All ordered pairs where the upper bound is not below the lower bound.
    var candidateRanges = (
        from lower in values
        from upper in values
        where upper.CompareTo(lower) >= 0
        select (min: lower, max: upper)).ToArray();

    foreach (var (statsMin, statsMax) in candidateRanges)
    {
        var rowGroupStats = new Dictionary<string, LogicalStatistics>
        {
            ["x"] = new LogicalStatistics<T>(statsMin, statsMax),
        };

        // Both bounds are always evaluated, in min-then-max order.
        var belowMin = FilterValueIsBelow(statsMin);
        var aboveMax = FilterValueIsAbove(statsMax);
        var expectedInclusion = !(belowMin || aboveMax);

        var actualInclusion = filter.IncludeRowGroup(rowGroupStats);
        Assert.That(
            actualInclusion, Is.EqualTo(expectedInclusion),
            $"Expected {typeof(T)} stats range [{statsMin}, {statsMax}] inclusion to be {expectedInclusion}");
    }

    // True when the filter value is below the statistics minimum. A minimum that
    // overflows the conversion to long is counted as being above the filter value.
    bool FilterValueIsBelow(T min)
    {
        try
        {
            return filterValue < checkedCast(min);
        }
        catch (OverflowException)
        {
            return true;
        }
    }

    // True when the filter value is above the statistics maximum. A maximum that
    // overflows the conversion to long never excludes the filter value.
    bool FilterValueIsAbove(T max)
    {
        try
        {
            return filterValue > checkedCast(max);
        }
        catch (OverflowException)
        {
            return false;
        }
    }
}
283+
284+
/// <summary>
/// Verifies that a range filter on column "x" includes a row group exactly when the
/// filter range [rangeStart, rangeEnd] overlaps the row group's [min, max] statistics.
/// Every (min, max) pair drawn from <paramref name="values"/> with max &gt;= min is tried.
/// </summary>
/// <typeparam name="T">Element type of the column statistics.</typeparam>
/// <param name="rangeStart">Start of the filter range.</param>
/// <param name="rangeEnd">End of the filter range.</param>
/// <param name="values">Candidate values used as both statistics minimums and maximums.</param>
/// <param name="checkedCast">
/// Converts a <typeparamref name="T"/> to long; an <see cref="OverflowException"/>
/// thrown by it is handled as "value not representable as a long".
/// </param>
private static void TestIntRangeIncludeRowGroup<T>(long rangeStart, long rangeEnd, T[] values, Func<T, long> checkedCast)
    where T : IComparable<T>
{
    var filter = Col.Named("x").IsInRange(rangeStart, rangeEnd);

    // All ordered pairs where the upper bound is not below the lower bound.
    var candidateRanges = (
        from lower in values
        from upper in values
        where upper.CompareTo(lower) >= 0
        select (min: lower, max: upper)).ToArray();

    foreach (var (statsMin, statsMax) in candidateRanges)
    {
        var rowGroupStats = new Dictionary<string, LogicalStatistics>
        {
            ["x"] = new LogicalStatistics<T>(statsMin, statsMax),
        };

        // Both bounds are always evaluated, in min-then-max order.
        var statsStartAfterRange = MinIsAfterRangeEnd(statsMin);
        var statsEndBeforeRange = MaxIsBeforeRangeStart(statsMax);
        var expectedInclusion = !(statsStartAfterRange || statsEndBeforeRange);

        var actualInclusion = filter.IncludeRowGroup(rowGroupStats);
        Assert.That(
            actualInclusion, Is.EqualTo(expectedInclusion),
            $"Expected {typeof(T)} stats range [{statsMin}, {statsMax}] inclusion to be {expectedInclusion}");
    }

    // True when the statistics minimum lies past the end of the filter range. A minimum
    // that overflows the conversion to long is counted as being past the range end.
    bool MinIsAfterRangeEnd(T min)
    {
        try
        {
            return checkedCast(min) > rangeEnd;
        }
        catch (OverflowException)
        {
            return true;
        }
    }

    // True when the statistics maximum lies before the start of the filter range. A
    // maximum that overflows the conversion to long never rules out an overlap.
    bool MaxIsBeforeRangeStart(T max)
    {
        try
        {
            return checkedCast(max) < rangeStart;
        }
        catch (OverflowException)
        {
            return false;
        }
    }
}
332+
206333
private static TArray BuildArray<T, TArray, TBuilder>(T[] values)
207334
where T : struct
208335
where TArray : IArrowArray

0 commit comments

Comments
 (0)