Skip to content

Commit c98ee2b

Browse files
authored
Allow getting a ParquetFileReader and the SchemaManifest from an Arrow.FileReader (#430)
1 parent 2879d32 commit c98ee2b

File tree

13 files changed

+548
-8
lines changed

13 files changed

+548
-8
lines changed

cpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ add_library(ParquetSharpNative SHARED
6767
arrow/ArrowWriterPropertiesBuilder.cpp
6868
arrow/FileReader.cpp
6969
arrow/FileWriter.cpp
70+
arrow/SchemaField.cpp
71+
arrow/SchemaManifest.cpp
7072
encryption/CryptoFactory.cpp
7173
encryption/DecryptionConfiguration.cpp
7274
encryption/EncryptionConfiguration.cpp

cpp/arrow/FileReader.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <arrow/c/bridge.h>
55
#include <arrow/record_batch.h>
66
#include <parquet/arrow/reader.h>
7+
#include <parquet/file_reader.h>
78

89
#include "cpp/ParquetSharpExport.h"
910
#include "../ExceptionInfo.h"
@@ -104,6 +105,20 @@ extern "C"
104105
)
105106
}
106107

108+
// Exposes the parquet::ParquetFileReader held by an Arrow FileReader.
//
// reader:         the Arrow FileReader to query.
// parquet_reader: out-parameter that receives the pointer returned by
//                 reader->parquet_reader().
//
// NOTE(review): nothing here transfers ownership; presumably the returned pointer
// stays owned by `reader` and must not outlive it - confirm against the Arrow API.
// NOTE(review): TRYCATCH is defined outside this view; presumably it produces the
// ExceptionInfo* return value - confirm in ExceptionInfo.h.
PARQUETSHARP_EXPORT ExceptionInfo* FileReader_ParquetReader(
  FileReader* reader,
  parquet::ParquetFileReader** parquet_reader)
{
  TRYCATCH(
    parquet::ParquetFileReader* const underlying = reader->parquet_reader();
    *parquet_reader = underlying;
  )
}
114+
115+
// Exposes the SchemaManifest of an Arrow FileReader.
//
// reader:   the Arrow FileReader to query.
// manifest: out-parameter that receives the address of the object returned by
//           reader->manifest().
//
// NOTE(review): the manifest is handed out by address, so it presumably lives
// inside `reader` and becomes invalid once the reader is freed - confirm.
PARQUETSHARP_EXPORT ExceptionInfo* FileReader_Manifest(
  FileReader* reader,
  const SchemaManifest** manifest)
{
  TRYCATCH(
    const SchemaManifest& schema_manifest = reader->manifest();
    *manifest = &schema_manifest;
  )
}
121+
107122
PARQUETSHARP_EXPORT void FileReader_Free(FileReader* reader)
108123
{
109124
delete reader;

cpp/arrow/SchemaField.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include <arrow/c/abi.h>
2+
#include <arrow/c/bridge.h>
3+
#include <parquet/arrow/schema.h>
4+
#include <parquet/exception.h>
5+
6+
#include "cpp/ParquetSharpExport.h"
7+
#include "../ExceptionInfo.h"
8+
9+
using namespace parquet::arrow;
10+
11+
extern "C"
12+
{
13+
// Reports how many child fields a SchemaField has.
//
// field:  the SchemaField whose `children` container is measured.
// length: out-parameter that receives children.size() narrowed to int32_t.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaField_ChildrenLength(const SchemaField* field, int32_t* length)
{
  TRYCATCH(
    const auto child_count = field->children.size();
    *length = static_cast<int32_t>(child_count);
  )
}
17+
18+
// Looks up one child of a SchemaField by position.
//
// field: the SchemaField whose `children` container is indexed.
// index: zero-based position of the requested child.
// child: out-parameter that receives the address of field->children[index].
//
// Throws std::out_of_range (inside TRYCATCH) when index is negative or is not
// less than field->children.size(). The negative check is required because the
// index arrives as a signed value from managed code and children[index] performs
// no bounds checking of its own, so a negative value would read outside the
// container instead of reporting an error.
//
// NOTE(review): TRYCATCH is defined outside this view; presumably it converts the
// exception into the returned ExceptionInfo* - confirm in ExceptionInfo.h.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaField_Child(const SchemaField* field, int32_t index, const SchemaField** child)
{
  TRYCATCH(
    if (index < 0 || static_cast<size_t>(index) >= field->children.size())
    {
      throw std::out_of_range("Child field index out of range");
    }
    *child = &(field->children[index]);
  )
}
28+
29+
// Reads the column_index member of a SchemaField.
//
// field:        the SchemaField to query.
// column_index: out-parameter that receives field->column_index.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaField_ColumnIndex(const SchemaField* field, int32_t* column_index)
{
  TRYCATCH(
    const auto index_value = field->column_index;
    *column_index = index_value;
  )
}
35+
36+
// Exports the Arrow field attached to a SchemaField through the Arrow C data
// interface.
//
// field:       the SchemaField whose `field` member is exported.
// arrow_field: caller-provided ArrowSchema struct that arrow::ExportField fills in.
//
// A non-OK status from arrow::ExportField is raised via PARQUET_THROW_NOT_OK.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaField_Field(const SchemaField* field, struct ArrowSchema* arrow_field)
{
  TRYCATCH(
    const auto export_status = arrow::ExportField(*(field->field), arrow_field);
    PARQUET_THROW_NOT_OK(export_status);
  )
}
42+
}

cpp/arrow/SchemaManifest.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#include <parquet/arrow/schema.h>
2+
#include <parquet/exception.h>
3+
4+
#include "cpp/ParquetSharpExport.h"
5+
#include "../ExceptionInfo.h"
6+
7+
using namespace parquet::arrow;
8+
9+
extern "C"
10+
{
11+
// Reports how many top-level schema fields a SchemaManifest holds.
//
// manifest: the SchemaManifest whose `schema_fields` container is measured.
// length:   out-parameter that receives schema_fields.size() narrowed to int32_t.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaManifest_SchemaFieldsLength(const SchemaManifest* manifest, int32_t* length)
{
  TRYCATCH(
    const auto field_count = manifest->schema_fields.size();
    *length = static_cast<int32_t>(field_count);
  )
}
15+
16+
// Looks up one top-level schema field of a SchemaManifest by position.
//
// manifest: the SchemaManifest whose `schema_fields` container is indexed.
// index:    zero-based position of the requested field.
// field:    out-parameter that receives the address of
//           manifest->schema_fields[index].
//
// Throws std::out_of_range (inside TRYCATCH) when index is negative or is not
// less than manifest->schema_fields.size(). The negative check is required
// because the index arrives as a signed value from managed code and
// schema_fields[index] performs no bounds checking of its own, so a negative
// value would read outside the container instead of reporting an error.
//
// NOTE(review): TRYCATCH is defined outside this view; presumably it converts the
// exception into the returned ExceptionInfo* - confirm in ExceptionInfo.h.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaManifest_SchemaField(const SchemaManifest* manifest, int32_t index, const SchemaField** field)
{
  TRYCATCH(
    if (index < 0 || static_cast<size_t>(index) >= manifest->schema_fields.size())
    {
      throw std::out_of_range("Field index out of range");
    }
    *field = &(manifest->schema_fields[index]);
  )
}
26+
27+
// Finds the SchemaField that corresponds to a given column index.
//
// manifest:     the SchemaManifest to search.
// column_index: column index forwarded to SchemaManifest::GetColumnField.
// field:        out-parameter forwarded to GetColumnField, which fills it in.
//
// A non-OK status from GetColumnField is raised via PARQUET_THROW_NOT_OK.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaManifest_GetColumnField(const SchemaManifest* manifest, int32_t column_index, const SchemaField** field)
{
  TRYCATCH(
    const auto lookup_status = manifest->GetColumnField(column_index, field);
    PARQUET_THROW_NOT_OK(lookup_status);
  )
}
33+
34+
// Retrieves the parent of a SchemaField within a SchemaManifest.
//
// manifest: the SchemaManifest that is asked for the parent.
// field:    the SchemaField whose parent is requested.
// parent:   out-parameter that receives the result of manifest->GetParent(field).
//           NOTE(review): presumably null for a top-level field - the managed
//           tests in this change expect a null grandparent; confirm against Arrow.
PARQUETSHARP_EXPORT ExceptionInfo* SchemaManifest_GetParent(const SchemaManifest* manifest, const SchemaField* field, const SchemaField** parent)
{
  TRYCATCH(
    const SchemaField* const owner = manifest->GetParent(field);
    *parent = owner;
  )
}
38+
}

csharp.test/Arrow/TestFileReader.cs

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.Linq;
34
using System.Threading.Tasks;
45
using Apache.Arrow;
@@ -195,6 +196,168 @@ public async Task TestReadSelectedColumns()
195196
Assert.That(rowsRead, Is.EqualTo(RowsPerRowGroup * NumRowGroups));
196197
}
197198

199+
/// <summary>
/// Checks that the ParquetFileReader exposed by an Arrow FileReader can be used to
/// read per-row-group column statistics for column 1 of the test file.
/// </summary>
[Test]
public void TestAccessUnderlyingReader()
{
    using var buffer = new ResizableBuffer();
    WriteTestFile(buffer);

    using var inStream = new BufferReader(buffer);
    using var fileReader = new FileReader(inStream);
    using var parquetReader = fileReader.ParquetReader;

    // Each row group is expected to cover one contiguous block of RowsPerRowGroup values.
    foreach (var groupIndex in Enumerable.Range(0, NumRowGroups))
    {
        var expectedMin = groupIndex * RowsPerRowGroup;
        var expectedMax = (groupIndex + 1) * RowsPerRowGroup - 1;

        using var rowGroup = parquetReader.RowGroup(groupIndex);
        using var columnChunk = rowGroup.MetaData.GetColumnChunkMetaData(1);
        using var intStats = columnChunk.Statistics as Statistics<int>;

        Assert.That(intStats, Is.Not.Null);
        Assert.That(intStats!.HasMinMax);
        Assert.That(intStats.Min, Is.EqualTo(expectedMin));
        Assert.That(intStats.Max, Is.EqualTo(expectedMax));
    }
}
221+
222+
/// <summary>
/// Checks that a ParquetFileReader obtained from an Arrow FileReader reports a
/// descriptive error when it is used after the owning FileReader was disposed.
/// </summary>
[Test]
public void TestAccessUnderlyingReaderAfterDisposed()
{
    using var buffer = new ResizableBuffer();
    WriteTestFile(buffer);

    using var inStream = new BufferReader(buffer);

    // Obtain the underlying reader, then dispose its owner before handing it back.
    ParquetFileReader GetReaderFromDisposedOwner()
    {
        using var fileReader = new FileReader(inStream);
        return fileReader.ParquetReader;
    }

    using var orphanedReader = GetReaderFromDisposedOwner();

    var exception = Assert.Throws<NullReferenceException>(() => { _ = orphanedReader.FileMetaData; });
    Assert.That(exception!.Message, Does.Contain("owning parent has been disposed"));
}
241+
242+
/// <summary>
/// Walks the schema manifest of the nested test file and checks the Arrow field,
/// column index and children of every schema field: a struct "test_struct" with
/// leaves "a" and "b", followed by the top-level leaf "x".
/// </summary>
[Test]
public void TestSchemaManifest()
{
    using var buffer = new ResizableBuffer();
    WriteNestedTestFile(buffer);

    using var inStream = new BufferReader(buffer);
    using var fileReader = new FileReader(inStream);

    // Fetches the Arrow field of a schema field and checks its name and type.
    static void AssertArrowField(SchemaField schemaField, string expectedName, ArrowTypeId expectedType)
    {
        var arrowField = schemaField.Field;
        Assert.That(arrowField.Name, Is.EqualTo(expectedName));
        Assert.That(arrowField.DataType.TypeId, Is.EqualTo(expectedType));
    }

    var topLevelFields = fileReader.SchemaManifest.SchemaFields;
    Assert.That(topLevelFields.Count, Is.EqualTo(2));

    // First top-level field: the struct, which has no column index of its own.
    var structField = topLevelFields[0];
    AssertArrowField(structField, "test_struct", ArrowTypeId.Struct);
    Assert.That(structField.ColumnIndex, Is.EqualTo(-1));

    var structChildren = structField.Children;
    Assert.That(structChildren.Count, Is.EqualTo(2));
    Assert.That(structChildren[0].ColumnIndex, Is.EqualTo(0));
    Assert.That(structChildren[1].ColumnIndex, Is.EqualTo(1));
    AssertArrowField(structChildren[0], "a", ArrowTypeId.Int32);
    AssertArrowField(structChildren[1], "b", ArrowTypeId.Float);

    // Second top-level field: a plain leaf column.
    Assert.That(topLevelFields[1].Children.Count, Is.EqualTo(0));
    Assert.That(topLevelFields[1].ColumnIndex, Is.EqualTo(2));
    AssertArrowField(topLevelFields[1], "x", ArrowTypeId.Int32);
}
280+
281+
/// <summary>
/// Checks that a single top-level schema field can be fetched by index and that
/// an index past the end is reported as a ParquetException.
/// </summary>
[Test]
public void TestSchemaManifestGetSingleField()
{
    using var buffer = new ResizableBuffer();
    WriteNestedTestFile(buffer);

    using var inStream = new BufferReader(buffer);
    using var fileReader = new FileReader(inStream);

    var schemaManifest = fileReader.SchemaManifest;

    var secondField = schemaManifest.SchemaField(1);
    Assert.That(secondField, Is.Not.Null);

    var secondArrowField = secondField.Field;
    Assert.That(secondArrowField.Name, Is.EqualTo("x"));
    Assert.That(secondArrowField.DataType.TypeId, Is.EqualTo(ArrowTypeId.Int32));

    // The nested test file has exactly two top-level fields, so index 2 is invalid.
    var exception = Assert.Throws<ParquetException>(() => { _ = schemaManifest.SchemaField(2); });
    Assert.That(exception!.Message, Does.Contain("out of range"));
}
300+
301+
/// <summary>
/// Checks that a schema field can be looked up by column index and that an
/// unknown column index is reported as a ParquetException naming that index.
/// </summary>
[Test]
public void TestSchemaManifestGetColumnField()
{
    using var buffer = new ResizableBuffer();
    WriteNestedTestFile(buffer);

    using var inStream = new BufferReader(buffer);
    using var fileReader = new FileReader(inStream);

    var schemaManifest = fileReader.SchemaManifest;

    var leafField = schemaManifest.GetColumnField(2);
    Assert.That(leafField, Is.Not.Null);

    var leafArrowField = leafField.Field;
    Assert.That(leafArrowField.Name, Is.EqualTo("x"));
    Assert.That(leafArrowField.DataType.TypeId, Is.EqualTo(ArrowTypeId.Int32));

    // Column index 3 does not exist in the nested test file.
    var exception = Assert.Throws<ParquetException>(() => { _ = schemaManifest.GetColumnField(3); });
    Assert.That(exception!.Message, Does.Contain("Column index 3"));
}
320+
321+
/// <summary>
/// Checks that the parent of a struct member is the struct field, and that the
/// struct field itself has no parent.
/// </summary>
[Test]
public void TestSchemaManifestGetFieldParent()
{
    using var buffer = new ResizableBuffer();
    WriteNestedTestFile(buffer);

    using var inStream = new BufferReader(buffer);
    using var fileReader = new FileReader(inStream);

    var schemaManifest = fileReader.SchemaManifest;

    // Column 1 is the "b" member of "test_struct" in the nested test file.
    var structMember = schemaManifest.GetColumnField(1);
    var structParent = schemaManifest.GetParent(structMember);
    Assert.That(structParent, Is.Not.Null);

    var parentArrowField = structParent!.Field;
    Assert.That(parentArrowField.Name, Is.EqualTo("test_struct"));
    Assert.That(parentArrowField.DataType.TypeId, Is.EqualTo(ArrowTypeId.Struct));

    // A top-level field has no parent.
    Assert.That(schemaManifest.GetParent(structParent), Is.Null);
}
342+
343+
/// <summary>
/// Checks that a SchemaField obtained from a FileReader's manifest reports a
/// descriptive error when it is used after the owning FileReader was disposed.
/// </summary>
[Test]
public void TestAccessSchemaManifestFieldAfterDisposed()
{
    using var buffer = new ResizableBuffer();
    WriteTestFile(buffer);

    using var inStream = new BufferReader(buffer);

    // Grab the first schema field, then dispose the reader that owns it.
    SchemaField GetFieldFromDisposedOwner()
    {
        using var fileReader = new FileReader(inStream);
        return fileReader.SchemaManifest.SchemaFields[0];
    }

    var orphanedField = GetFieldFromDisposedOwner();

    var exception = Assert.Throws<NullReferenceException>(() => { _ = orphanedField.Field; });
    Assert.That(exception!.Message, Does.Contain("owning parent has been disposed"));
}
360+
198361
private static void WriteTestFile(ResizableBuffer buffer)
199362
{
200363
var columns = new Column[]
@@ -226,6 +389,44 @@ private static void WriteTestFile(ResizableBuffer buffer)
226389
fileWriter.Close();
227390
}
228391

392+
/// <summary>
/// Writes a Parquet file with a nested schema into <paramref name="buffer"/>:
/// a nullable struct "test_struct" with non-nullable members "a" (Int32) and
/// "b" (Float), plus a non-nullable Int32 column "x". One record batch of
/// RowsPerRowGroup rows is written per iteration, NumRowGroups times.
/// </summary>
/// <param name="buffer">Buffer that receives the written file.</param>
private static void WriteNestedTestFile(ResizableBuffer buffer)
{
    var structType = new StructType(
        new[]
        {
            new Field("a", new Int32Type(), false),
            new Field("b", new FloatType(), false),
        });
    var fields = new[]
    {
        new Field("test_struct", structType, true),
        new Field("x", new Int32Type(), false),
    };
    var schema = new Apache.Arrow.Schema(fields, null);

    using var outStream = new BufferOutputStream(buffer);
    using var writer = new FileWriter(outStream, schema);

    for (var groupIndex = 0; groupIndex < NumRowGroups; ++groupIndex)
    {
        // Consecutive integers starting where the previous batch stopped.
        var firstValue = groupIndex * RowsPerRowGroup;
        var intValues = Enumerable.Range(firstValue, RowsPerRowGroup).ToArray();
        var floatValues = intValues.Select(i => i * 0.1f).ToArray();

        var structMembers = new IArrowArray[]
        {
            new Int32Array.Builder().AppendRange(intValues).Build(),
            new FloatArray.Builder().AppendRange(floatValues).Build(),
        };
        // Every struct row is marked valid.
        var structValidity = new ArrowBuffer.BitmapBuilder().AppendRange(true, RowsPerRowGroup).Build();

        var arrays = new List<IArrowArray>
        {
            new StructArray(fields[0].DataType, RowsPerRowGroup, structMembers, structValidity),
            new Int32Array.Builder().AppendRange(intValues).Build(),
        };

        writer.WriteRecordBatch(new RecordBatch(schema, arrays, RowsPerRowGroup));
    }

    writer.Close();
}
429+
229430
private const int NumRowGroups = 4;
230431
private const int RowsPerRowGroup = 100;
231432
}

csharp/Arrow/FileReader.cs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,30 @@ public unsafe IArrowArrayStream GetRecordBatchReader(
138138
return CArrowArrayStreamImporter.ImportArrayStream(&cStream);
139139
}
140140

141+
/// <summary>
/// Get the underlying ParquetFileReader used by this Arrow FileReader.
/// The native reader pointer is wrapped in a child handle whose parent is this
/// FileReader's handle; a new wrapper is created on every access.
/// </summary>
public ParquetFileReader ParquetReader =>
    new ParquetFileReader(
        new ChildParquetHandle(
            ExceptionInfo.Return<IntPtr>(_handle, FileReader_ParquetReader),
            _handle));
152+
153+
/// <summary>
/// Get the schema manifest, which describes the relationship between the Arrow schema and Parquet schema.
/// The native manifest pointer is wrapped in a child handle whose parent is this
/// FileReader's handle; a new wrapper is created on every access.
/// </summary>
public SchemaManifest SchemaManifest =>
    new SchemaManifest(
        new ChildParquetHandle(
            ExceptionInfo.Return<IntPtr>(_handle, FileReader_Manifest),
            _handle));
164+
141165
public void Dispose()
142166
{
143167
_handle.Dispose();
@@ -165,6 +189,12 @@ private static extern IntPtr FileReader_OpenFile(
165189
private static extern unsafe IntPtr FileReader_GetRecordBatchReader(
166190
IntPtr reader, int* rowGroups, int rowGroupsCount, int* columns, int columnsCount, CArrowArrayStream* stream);
167191

192+
[DllImport(ParquetDll.Name)]
193+
private static extern IntPtr FileReader_ParquetReader(IntPtr reader, out IntPtr parquetReader);
194+
195+
[DllImport(ParquetDll.Name)]
196+
private static extern IntPtr FileReader_Manifest(IntPtr reader, out IntPtr manifest);
197+
168198
[DllImport(ParquetDll.Name)]
169199
private static extern void FileReader_Free(IntPtr reader);
170200

0 commit comments

Comments
 (0)