diff --git a/docs/guides/Reading.md b/docs/guides/Reading.md index 5355d869..c6d1fc91 100644 --- a/docs/guides/Reading.md +++ b/docs/guides/Reading.md @@ -63,29 +63,8 @@ DateTime[] timestamps = rowGroupReader.Column(0).LogicalReader().ReadA ### Reading columns with unknown types -However, if you don't know ahead of time the types for each column, you can implement the -`ILogicalColumnReaderVisitor` interface to handle column data in a type-safe way, for example: - -```csharp -sealed class ColumnPrinter : ILogicalColumnReaderVisitor -{ - public string OnLogicalColumnReader(LogicalColumnReader columnReader) - { - var stringBuilder = new StringBuilder(); - foreach (var value in columnReader) { - stringBuilder.Append(value?.ToString() ?? "null"); - stringBuilder.Append(","); - } - return stringBuilder.ToString(); - } -} - -string columnValues = rowGroupReader.Column(0).LogicalReader().Apply(new ColumnPrinter()); -``` - -There's a similar `IColumnReaderVisitor` interface for working with `ColumnReader` objects -and reading physical values in a type-safe way, but most users will want to work at the logical element level. - +If you don't know the type of each column ahead of time, you can use a visitor to handle column data in a type-safe way. +See [Visitor patterns: reading & writing with unknown column types](VisitorPatterns.md) for examples using `ILogicalColumnReaderVisitor` and related visitor types. ### Reading data in batches diff --git a/docs/guides/VisitorPatterns.md b/docs/guides/VisitorPatterns.md new file mode 100644 index 00000000..bed8d4dd --- /dev/null +++ b/docs/guides/VisitorPatterns.md @@ -0,0 +1,227 @@ +# Visitor patterns: reading & writing with unknown column types + +ParquetSharp exposes a number of "visitor" interfaces that make it convenient to read or write columns when you don't know the concrete column types at compile time. These visitors let you write type-safe code that is invoked for the actual column element type at runtime. 
+ +## ILogicalColumnWriterVisitor + +The @ParquetSharp.ILogicalColumnWriterVisitor`1 interface is invoked for logical writers (high-level typed writers). Use this when you need to write data to columns but don't know the column types at compile time. + +### Example: Generic column writer + +```csharp +// A visitor that writes arrays of values to any column type +sealed class GenericColumnWriter : ILogicalColumnWriterVisitor<bool> +{ + private readonly IDictionary<string, Array> _valuesByColumn; + + public GenericColumnWriter(IDictionary<string, Array> valuesByColumn) + { + _valuesByColumn = valuesByColumn; + } + + public bool OnLogicalColumnWriter<TValue>(LogicalColumnWriter<TValue> columnWriter) + { + // Look up values for this column name + if (!_valuesByColumn.TryGetValue(columnWriter.ColumnDescriptor.Path.ToDotString(), out var raw)) + return false; + + var values = (TValue[])raw; + columnWriter.WriteBatch(values); + return true; + } +} + +// Usage +var valuesByColumn = new Dictionary<string, Array> +{ + { "Id", new[] { 1, 2, 3 } }, + { "Name", new[] { "Alice", "Bob", "Carol" } }, + { "Price", new[] { 9.99, 12.50, 5.75 } } +}; + +using var logicalWriter = columnWriter.LogicalWriter(); +var success = logicalWriter.Apply(new GenericColumnWriter(valuesByColumn)); +``` + +#### Casting arrays safely + +The `(TValue[])raw` cast is safe only when the visitor is invoked with a `TValue` that matches the element type of the stored array. Always ensure your stored arrays match the declared column types to avoid an `InvalidCastException` at runtime. 
+ + +### Example: Conditional writer based on type + +```csharp +// A visitor that writes a fill value to numeric columns and default(TValue) to all other columns +sealed class NumericOnlyWriter : ILogicalColumnWriterVisitor<bool> +{ + private readonly double _fillValue; + + public NumericOnlyWriter(double fillValue) => _fillValue = fillValue; + + public bool OnLogicalColumnWriter<TValue>(LogicalColumnWriter<TValue> columnWriter) + { + TValue val; + if (typeof(TValue) == typeof(int) || + typeof(TValue) == typeof(double) || + typeof(TValue) == typeof(float) || + typeof(TValue) == typeof(long)) + { + // Convert _fillValue to the correct TValue + val = (TValue)Convert.ChangeType(_fillValue, typeof(TValue)); + } + else + { + // write default(TValue) so the row count matches + val = default!; + } + + var arr = new TValue[] { val }; + columnWriter.WriteBatch(arr); + return true; + } +} +``` + +## ILogicalColumnReaderVisitor + +The @ParquetSharp.ILogicalColumnReaderVisitor`1 interface is invoked for logical readers (high-level typed readers). Use this when you need to read data from columns of unknown types. + +### Example: Convert columns to strings + +```csharp +// A visitor that reads all values and returns them as a comma-separated string +sealed class ColumnToStringReader : ILogicalColumnReaderVisitor<string> +{ + public string OnLogicalColumnReader<TElement>(LogicalColumnReader<TElement> columnReader) + { + var sb = new StringBuilder(); + const int bufferSize = 1024; + var buffer = new TElement[bufferSize]; + + while (columnReader.HasNext) + { + var read = columnReader.ReadBatch(buffer); + for (var i = 0; i < read; ++i) + { + sb.Append(buffer[i]?.ToString() ?? 
"null"); + sb.Append(", "); + } + } + + if (sb.Length >= 2) sb.Length -= 2; + return sb.ToString(); + } +} + +// Usage +using var logicalReader = columnReader.LogicalReader(); +var columnString = logicalReader.Apply(new ColumnToStringReader()); +Console.WriteLine($"Column data: {columnString}"); +``` + +### Example: Count the rows in a column + +```csharp +// A visitor that computes the row count for any column type +sealed class RowCountReader : ILogicalColumnReaderVisitor<long> +{ + public long OnLogicalColumnReader<TElement>(LogicalColumnReader<TElement> columnReader) + { + long count = 0; + const int bufferSize = 1024; + var buffer = new TElement[bufferSize]; + + while (columnReader.HasNext) + { + count += columnReader.ReadBatch(buffer); + } + + return count; + } +} + +// Usage +using var logicalReader = columnReader.LogicalReader(); +var rowCount = logicalReader.Apply(new RowCountReader()); +Console.WriteLine($"Total rows: {rowCount}"); +``` + +## IColumnWriterVisitor + +The @ParquetSharp.IColumnWriterVisitor`1 interface provides lower-level access to physical column writers. Use this when you need to work with physical types, definition levels, repetition levels, or encodings. + +### Example: Physical type inspector + +```csharp +// A visitor that reports the physical type being written +sealed class PhysicalTypeWriter : IColumnWriterVisitor<string> +{ + public string OnColumnWriter<TValue>(ColumnWriter<TValue> columnWriter) + where TValue : unmanaged + { + var physicalType = typeof(TValue).Name; + Console.WriteLine($"Writing physical type: {physicalType}"); + + // Could perform low-level writes here if needed + // columnWriter.WriteBatch(..., definitionLevels, repetitionLevels); + + return physicalType; + } +} +``` + +## IColumnReaderVisitor + +The @ParquetSharp.IColumnReaderVisitor`1 interface provides lower-level access to physical column readers. Use this for low-level operations that require access to definition levels, repetition levels, or physical encodings. 
+ +### Example: Definition level analyzer + +```csharp +// A visitor that counts null values using definition levels +sealed class NullCountReader : IColumnReaderVisitor<int> +{ + public int OnColumnReader<TValue>(ColumnReader<TValue> columnReader) + where TValue : unmanaged + { + const int bufferSize = 1024; + var values = new TValue[bufferSize]; + var defLevels = new short[bufferSize]; + var repLevels = new short[bufferSize]; + int nullCount = 0; + + while (columnReader.HasNext) + { + var read = columnReader.ReadBatch(bufferSize, defLevels, repLevels, values, out var valuesRead); + + // Count definition levels that indicate null + for (int i = 0; i < read; i++) + { + if (defLevels[i] < columnReader.ColumnDescriptor.MaxDefinitionLevel) + { + nullCount++; + } + } + } + + return nullCount; + } +} +``` + +## IColumnDescriptorVisitor + +The @ParquetSharp.IColumnDescriptorVisitor`1 interface visits column descriptors (schema metadata) without performing any I/O. Use this when you only need to inspect or process schema information. + +## Best practices + +### When to use each visitor type + +- **ILogicalColumnWriterVisitor / ILogicalColumnReaderVisitor**: Use for high-level, type-safe reading and writing when column types are unknown at compile time. Ideal for generic tooling, schema-driven processing, and data exporters. + +- **IColumnWriterVisitor / IColumnReaderVisitor**: Use for low-level operations requiring access to definition levels, repetition levels, or physical encodings. + +- **IColumnDescriptorVisitor**: Use when you only need to inspect schema metadata without performing I/O. Perfect for schema validation, type checking, and metadata extraction. + +### When to avoid visitors + +If you already know the schema at compile time, prefer the generic `LogicalWriter<TElement>()` / `LogicalReader<TElement>()` APIs — they are simpler and more maintainable. 
\ No newline at end of file diff --git a/docs/guides/Writing.md b/docs/guides/Writing.md index 728da5d7..66e246e0 100644 --- a/docs/guides/Writing.md +++ b/docs/guides/Writing.md @@ -121,28 +121,8 @@ There is also a `ColumnWriter.LogicalWriterOverride` method, which supports writ to the default .NET type corresponding to the column's logical type. For more information on how to use this, see the [type factories documentation](TypeFactories.md). -If you don't know ahead of time the column types that will be written, -you can implement the `ILogicalColumnWriterVisitor` interface to handle writing data in a type-safe way: - -```csharp -sealed class ExampleWriter : ILogicalColumnWriterVisitor -{ - public bool OnLogicalColumnWriter(LogicalColumnWriter columnWriter) - { - TValue[] values = GetValues(); - columnWriter.WriteBatch(values); - return true; - } -} - -using RowGroupWriter rowGroup = file.AppendRowGroup(); -for (int columnIndex = 0; columnIndex < file.NumColumns; ++columnIndex) -{ - using var columnWriter = rowGroup.NextColumn(); - using var logicalWriter = columnWriter.LogicalWriter(); - var returnVal = logicalWriter.Apply(new ExampleWriter()); -} -``` +If you don't know ahead of time the column types that will be written, see the visitor-pattern guide: +[Visitor patterns: reading & writing with unknown column types](VisitorPatterns.md) — it includes examples of writing and reading columns of unknown types using `ILogicalColumnWriterVisitor` and `ILogicalColumnReaderVisitor`. ### Closing the ParquetFileWriter diff --git a/docs/guides/toc.yml b/docs/guides/toc.yml index bcaa9594..7b4db103 100644 --- a/docs/guides/toc.yml +++ b/docs/guides/toc.yml @@ -16,3 +16,5 @@ href: TimeSpan.md - name: Use from PowerShell href: PowerShell.md +- name: Visitor Patterns + href: VisitorPatterns.md