From 4deb3707d54ccef6d8d1d19ab61ae03ef1322fa1 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 14 Aug 2024 07:33:52 +1200 Subject: [PATCH] Fix reading required string values inside an optional group (#481) --- csharp.test/TestLogicalTypeRoundtrip.cs | 159 ++++++++++++++++++ .../LogicalBatchReaderFactory.cs | 4 +- 2 files changed, 162 insertions(+), 1 deletion(-) diff --git a/csharp.test/TestLogicalTypeRoundtrip.cs b/csharp.test/TestLogicalTypeRoundtrip.cs index 084fbb71..b9ba77b3 100644 --- a/csharp.test/TestLogicalTypeRoundtrip.cs +++ b/csharp.test/TestLogicalTypeRoundtrip.cs @@ -1225,6 +1225,165 @@ public static void TestRequiredNestedRoundtripString() CheckNestedRoundtrip(values, elementNode); } + [Test] + public static void TestRequiredIntInOptionalGroup() + { + var values = Enumerable.Range(0, 100) + .Select(i => i % 10 == 3 ? (Nested?) null : new Nested(i)) + .ToArray(); + using var noneType = LogicalType.None(); + using var elementNode = new PrimitiveNode("element", Repetition.Required, noneType, PhysicalType.Int32); + using var groupNode = new GroupNode("group", Repetition.Optional, new[] {elementNode}); + using var schemaNode = new GroupNode("schema", Repetition.Required, new[] {groupNode}); + + using var buffer = new ResizableBuffer(); + using (var output = new BufferOutputStream(buffer)) + { + using var properties = WriterProperties.GetDefaultWriterProperties(); + using var fileWriter = new ParquetFileWriter(output, schemaNode, properties); + using var rowGroupWriter = fileWriter.AppendBufferedRowGroup(); + using var colWriter = rowGroupWriter.Column(0).LogicalWriter?>(); + + colWriter.WriteBatch(values); + + fileWriter.Close(); + } + + using var input = new BufferReader(buffer); + using var fileReader = new ParquetFileReader(input); + using var rowGroupReader = fileReader.RowGroup(0); + + // Read nested + using (var colReader = rowGroupReader.Column(0).LogicalReader?>()) + { + var actual = colReader.ReadAll((int) rowGroupReader.MetaData.NumRows); + Assert.IsNotEmpty(actual); + Assert.AreEqual(values.Length, actual.Length); + for (var i = 0; i < values.Length; i++) + { + if (values[i].HasValue) + { + Assert.That(actual[i].HasValue); + Assert.That(actual[i]!.Value, Is.EqualTo(values[i]!.Value)); + } + else + { + Assert.That(actual[i].HasValue, Is.False); + } + } + } + + // Read without nesting struct + using (var colReader = rowGroupReader.Column(0).LogicalReader()) + { + var actual = colReader.ReadAll((int) rowGroupReader.MetaData.NumRows); + Assert.IsNotEmpty(actual); + Assert.AreEqual(values.Length, actual.Length); + for (var i = 0; i < values.Length; i++) + { + if (values[i].HasValue) + { + Assert.That(actual[i] != null); + Assert.That(actual[i], Is.EqualTo(values[i]!.Value.Value)); + } + else + { + Assert.That(actual[i], Is.Null); + } + } + } + + fileReader.Close(); + } + + [Test] + public static void TestStringInOptionalGroup([Values] bool requiredString) + { + Nested?[] values; + if (requiredString) + { + values = Enumerable.Range(0, 100) + .Select(i => i % 10 == 3 + ? (Nested?) null + : new Nested(i.ToString())) + .ToArray(); + } + else + { + values = Enumerable.Range(0, 100) + .Select(i => i % 10 == 3 + ? (Nested?) null + : i % 10 == 5 + ? new Nested(null) + : new Nested(i.ToString())) + .ToArray(); + } + using var stringType = LogicalType.String(); + using var elementNode = new PrimitiveNode( + "element", requiredString ? Repetition.Required : Repetition.Optional, stringType, PhysicalType.ByteArray); + using var groupNode = new GroupNode("group", Repetition.Optional, new[] {elementNode}); + using var schemaNode = new GroupNode("schema", Repetition.Required, new[] {groupNode}); + + using var buffer = new ResizableBuffer(); + using (var output = new BufferOutputStream(buffer)) + { + using var properties = WriterProperties.GetDefaultWriterProperties(); + using var fileWriter = new ParquetFileWriter(output, schemaNode, properties); + using var rowGroupWriter = fileWriter.AppendBufferedRowGroup(); + using var colWriter = rowGroupWriter.Column(0).LogicalWriter?>(); + + colWriter.WriteBatch(values); + + fileWriter.Close(); + } + + using var input = new BufferReader(buffer); + using var fileReader = new ParquetFileReader(input); + using var rowGroupReader = fileReader.RowGroup(0); + + // Read nested + using (var colReader = rowGroupReader.Column(0).LogicalReader?>()) + { + var actual = colReader.ReadAll((int) rowGroupReader.MetaData.NumRows); + Assert.IsNotEmpty(actual); + Assert.AreEqual(values.Length, actual.Length); + for (var i = 0; i < values.Length; i++) + { + if (values[i].HasValue) + { + Assert.That(actual[i].HasValue); + Assert.That(actual[i]!.Value, Is.EqualTo(values[i]!.Value)); + } + else + { + Assert.That(actual[i].HasValue, Is.False); + } + } + } + + // Read without nesting struct + using (var colReader = rowGroupReader.Column(0).LogicalReader()) + { + var actual = colReader.ReadAll((int) rowGroupReader.MetaData.NumRows); + Assert.IsNotEmpty(actual); + Assert.AreEqual(values.Length, actual.Length); + for (var i = 0; i < values.Length; i++) + { + if (values[i].HasValue && values[i]!.Value.Value != null) + { + Assert.That(actual[i] != null); + Assert.That(actual[i], Is.EqualTo(values[i]!.Value.Value)); + } + else + { + Assert.That(actual[i], Is.Null); + } + } + } + + fileReader.Close(); + } + [Test] public static void TestRequiredString() { diff --git a/csharp/LogicalBatchReader/LogicalBatchReaderFactory.cs b/csharp/LogicalBatchReader/LogicalBatchReaderFactory.cs index 2d25bd7a..03988ff2 100644 --- a/csharp/LogicalBatchReader/LogicalBatchReaderFactory.cs +++ b/csharp/LogicalBatchReader/LogicalBatchReaderFactory.cs @@ -56,7 +56,9 @@ public ILogicalBatchReader GetReader(Node[] schemaNodes) // Otherwise we have a more complex structure and use a buffered reader var leafDefinitionLevel = (short) schemaNodes.Count(n => n.Repetition != Repetition.Required); - var nullableLeafValues = schemaNodes.Last().Repetition == Repetition.Optional; + // Reference typed leaf values are always treated as nullable as the converters are created based on the + // .NET type and don't consider the schema nullability. + var nullableLeafValues = schemaNodes.Last().Repetition == Repetition.Optional || !typeof(TLogical).IsValueType; _bufferedReader = new BufferedReader( _physicalReader, _converter, _buffers.Values, _buffers.DefLevels, _buffers.RepLevels, leafDefinitionLevel, nullableLeafValues); return GetCompoundReader(schemaNodes, 0, 0);