|
| 1 | +# Writing TimeSpan data |
| 2 | + |
| 3 | +When writing `TimeSpan` values, ParquetSharp will use the Parquet |
| 4 | +[Time logical type](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time) by default. |
| 5 | +However, this logical type is intended to represent a time of day. |
| 6 | +This doesn't cause any problem when reading data back with ParquetSharp, |
| 7 | +but may cause issues with other Parquet libraries that expect values to be non-negative and less |
| 8 | +than 24 hours. |
| 9 | + |
| 10 | +If you need to write negative `TimeSpan` values or values of 24 hours or more, |
| 11 | +and require that your Parquet files can be read by other libraries such as PyArrow, |
| 12 | +there are two main workarounds: |
| 13 | + |
| 14 | +## Write with the Arrow API |
| 15 | + |
| 16 | +If you convert your `TimeSpan` data to Arrow arrays with the `Duration` type, |
| 17 | +and use the [Arrow-based API](Arrow.md), the Arrow schema can be embedded |
| 18 | +in the file to tell Arrow-based consumers that the data represents a duration. |
| 19 | +Note that this requires Apache.Arrow version 15 or later: |
| 20 | + |
| 21 | +```c# |
| 22 | +using Apache.Arrow; |
| 23 | +using ParquetSharp.Arrow; |
| 24 | + |
// Placeholders: supply the values to write and the destination path.
TimeSpan[] timeSpanValues = ...;
string filePath = ...;

// Define the schema of the Arrow data to write: a single non-nullable
// duration column with microsecond resolution.
var durationType = Apache.Arrow.Types.DurationType.Microsecond;
var schema = new Schema(new []
{
    new Field("time", durationType, nullable: false),
}, metadata: null);

// Build an Arrow duration array, reserving space for all values up front
var durationBuilder = new DurationArray.Builder(durationType).Reserve(timeSpanValues.Length);
foreach (var value in timeSpanValues)
{
    durationBuilder.Append(value);
}

// Create a record batch to write.
// RecordBatch is IDisposable, so dispose it deterministically along with
// the other disposable objects used here.
using var recordBatch = new RecordBatch(schema, new IArrowArray[]
{
    durationBuilder.Build(),
}, timeSpanValues.Length);

// Enable storing the Arrow schema as this is disabled by default,
// and without this, durations will be read as plain int64 values
using var arrowProperties = new ArrowWriterPropertiesBuilder()
    .StoreSchema()
    .Build();

using var writer = new FileWriter(filePath, schema, arrowProperties: arrowProperties);
writer.WriteRecordBatch(recordBatch);
// Close explicitly before the writer is disposed at the end of the scope
writer.Close();
| 57 | +``` |
| 58 | + |
| 59 | +## Write as int64 |
| 60 | + |
| 61 | +Alternatively, you can write `TimeSpan` values as plain int64 data by using |
| 62 | +a custom converter. For example, to write `TimeSpan`s as a number of microseconds: |
| 63 | + |
| 64 | +```c# |
| 65 | +using ParquetSharp; |
| 66 | + |
// Placeholders: supply the values to write and the destination path.
TimeSpan[] timeSpanValues = ...;
string filePath = ...;

var columns = new ParquetSharp.Column[]
{
    // Override the default logical type for TimeSpans,
    // and tell ParquetSharp to use a 64 bit integer logical type
    new Column<TimeSpan>("time", LogicalType.Int(bitWidth: 64, isSigned: true)),
};

using var writer = new ParquetFileWriter(filePath, columns);
// We need to add a custom converter factory to tell ParquetSharp how
// to convert a TimeSpan to a long
writer.LogicalWriteConverterFactory = new CustomWriteConverterFactory();

using var rowGroup = writer.AppendRowGroup();
// Scope the column writer so that it is disposed as soon as the
// column has been written, before the file is closed
using (var timeWriter = rowGroup.NextColumn().LogicalWriter<TimeSpan>())
{
    timeWriter.WriteBatch(timeSpanValues);
}

writer.Close();
| 86 | + |
/// <summary>
/// Write converter factory that stores <see cref="TimeSpan"/> values as a
/// whole number of microseconds in a 64 bit integer column.
/// All other type combinations are delegated to the base factory.
/// </summary>
internal sealed class CustomWriteConverterFactory : LogicalWriteConverterFactory
{
    // A TimeSpan tick is 100 nanoseconds, so there are 10 ticks in a microsecond.
    private const long TicksPerMicrosecond = 10;

    /// <summary>
    /// Returns a converter that writes <see cref="TimeSpan"/> values as microsecond counts
    /// when the logical type is <see cref="TimeSpan"/> and the physical type is <see cref="long"/>;
    /// otherwise returns the converter provided by the base factory.
    /// </summary>
    public override Delegate GetConverter<TLogical, TPhysical>(ColumnDescriptor columnDescriptor, ByteBuffer? byteBuffer)
    {
        // The delegate returned below is specific to a long physical type, so only
        // use it when TPhysical is long. Any other TimeSpan column falls through
        // to the base factory rather than receiving a delegate of the wrong type.
        if (typeof(TLogical) == typeof(TimeSpan) && typeof(TPhysical) == typeof(long))
        {
            return (LogicalWrite<TimeSpan, long>.Converter) ((source, _, dest, _) =>
            {
                for (var i = 0; i < source.Length; ++i)
                {
                    // Integer division discards any sub-microsecond remainder
                    dest[i] = source[i].Ticks / TicksPerMicrosecond;
                }
            });
        }

        return base.GetConverter<TLogical, TPhysical>(columnDescriptor, byteBuffer);
    }
}
| 107 | +``` |
| 108 | + |
| 109 | +Note that when using this approach, if you read the file back with |
| 110 | +ParquetSharp, the data will be read as `long` values, as there's no |
| 111 | +way to tell that it was originally `TimeSpan` data. |
| 112 | +To read the data back as `TimeSpan`s, you'll also need to implement |
| 113 | +a custom `LogicalReadConverterFactory` and either use the `LogicalReaderOverride` method |
| 114 | +or provide a custom `LogicalTypeFactory`. |
| 115 | +See the [type factories documentation](TypeFactories.md) for more details. |
0 commit comments