Skip to content

Commit 862568b

Browse files
authored
Document workarounds for writing TimeSpan values (#422)
1 parent 3182e68 commit 862568b

File tree

5 files changed

+119
-3
lines changed

5 files changed

+119
-3
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ For more detailed information on how to use ParquetSharp, see the following docu
9797
* [Reading and writing Arrow data](docs/Arrow.md) — how to read and write data using the [Apache Arrow format](https://arrow.apache.org/)
9898
* [Row-oriented API](docs/RowOriented.md) — a higher level API that abstracts away the column-oriented nature of Parquet files
9999
* [Custom types](docs/TypeFactories.md) — how to override the mapping between .NET and Parquet types
100+
* [Writing TimeSpan data](docs/TimeSpan.md) — interoperability with other libraries when writing TimeSpan data
100101
* [Use from PowerShell](docs/PowerShell.md)
101102

102103
## Rationale

csharp.test/TestLogicalTypeFactory.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ private sealed class ReadConverterFactory : LogicalReadConverterFactory
491491
{
492492
public override Delegate? GetDirectReader<TLogical, TPhysical>()
493493
{
494-
// Optional: the following is an optimisation and not stricly needed (but helps with speed).
494+
// Optional: the following is an optimisation and not strictly needed (but helps with speed).
495495
// Since VolumeInDollars is bitwise identical to float, we can read the values in-place.
496496
if (typeof(TLogical) == typeof(VolumeInDollars)) return LogicalRead.GetDirectReader<VolumeInDollars, float>();
497497
return base.GetDirectReader<TLogical, TPhysical>();

csharp.test/TestRowOrientedParquetFile.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ private sealed class ReadConverterFactory : LogicalReadConverterFactory
538538
{
539539
public override Delegate? GetDirectReader<TLogical, TPhysical>()
540540
{
541-
// Optional: the following is an optimisation and not stricly needed (but helps with speed).
541+
// Optional: the following is an optimisation and not strictly needed (but helps with speed).
542542
// Since VolumeInDollars is bitwise identical to float, we can read the values in-place.
543543
if (typeof(TLogical) == typeof(VolumeInDollars)) return LogicalRead.GetDirectReader<VolumeInDollars, float>();
544544
return base.GetDirectReader<TLogical, TPhysical>();

docs/RowOriented.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ private sealed class ReadConverterFactory : LogicalReadConverterFactory
164164
{
165165
public override Delegate? GetDirectReader<TLogical, TPhysical>()
166166
{
167-
// Optional: the following is an optimisation and not stricly needed (but helps with speed).
167+
// Optional: the following is an optimisation and not strictly needed (but helps with speed).
168168
// Since VolumeInDollars is bitwise identical to float, we can read the values in-place.
169169
if (typeof(TLogical) == typeof(VolumeInDollars)) return LogicalRead.GetDirectReader<VolumeInDollars, float>();
170170
return base.GetDirectReader<TLogical, TPhysical>();

docs/TimeSpan.md

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Writing TimeSpan data
2+
3+
When writing `TimeSpan` values, ParquetSharp will use the Parquet
4+
[Time logical type](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time) by default.
5+
However, this logical type is intended to represent a time of day.
6+
This doesn't cause any problem when reading data back with ParquetSharp,
7+
but may cause issues with other Parquet libraries that expect values to be non-negative and less
8+
than 24 hours.
9+
10+
If you need to write negative `TimeSpan` values or values greater than 24 hours,
11+
and require that your Parquet files can be read by other libraries such as PyArrow,
12+
there are two main workarounds possible:
13+
14+
## Write with the Arrow API
15+
16+
If you convert your `TimeSpan` data to Arrow arrays with the `Duration` type,
17+
and use the [Arrow based API](Arrow.md), the Arrow schema can be embedded
18+
in the file to tell Arrow-based consumers that the data represents a duration.
19+
Note that this requires Apache.Arrow version 15 or later:
20+
21+
```c#
22+
using Apache.Arrow;
23+
using ParquetSharp.Arrow;
24+
25+
TimeSpan[] timeSpanValues = ...;
26+
string filePath = ...;
27+
28+
// Define the schema of the Arrow data to write
29+
var durationType = Apache.Arrow.Types.DurationType.Microsecond;
30+
var schema = new Schema(new []
31+
{
32+
new Field("time", durationType, nullable: false),
33+
}, metadata: null);
34+
35+
// Build an Arrow duration array
36+
var durationBuilder = new DurationArray.Builder(durationType).Reserve(timeSpanValues.Length);
37+
for (var i = 0; i < timeSpanValues.Length; ++i)
38+
{
39+
durationBuilder.Append(timeSpanValues[i]);
40+
}
41+
42+
// Create a record batch to write
43+
var recordBatch = new RecordBatch(schema, new IArrowArray[]
44+
{
45+
durationBuilder.Build(),
46+
}, timeSpanValues.Length);
47+
48+
// Enable storing the Arrow schema as this is disabled by default,
49+
// and without this, durations will be read as plain int64 values
50+
using var arrowProperties = new ArrowWriterPropertiesBuilder()
51+
.StoreSchema()
52+
.Build();
53+
54+
using var writer = new FileWriter(filePath, schema, arrowProperties: arrowProperties);
55+
writer.WriteRecordBatch(recordBatch);
56+
writer.Close();
57+
```
58+
59+
## Write as int64
60+
61+
Alternatively, you can write `TimeSpan` values as plain int64 data by using
62+
a custom converter. For example, to write `TimeSpan`s as a number of microseconds:
63+
64+
```c#
65+
using ParquetSharp;
66+
67+
var columns = new ParquetSharp.Column[]
68+
{
69+
// Override the default logical type for TimeSpans,
70+
// and tell ParquetSharp to use a signed 64-bit integer logical type
71+
new Column<TimeSpan>("time", LogicalType.Int(bitWidth: 64, isSigned: true)),
72+
};
73+
74+
using var writer = new ParquetFileWriter(filePath, columns);
75+
// We need to add a custom converter factory to tell ParquetSharp how
76+
// to convert a TimeSpan to a long
77+
writer.LogicalWriteConverterFactory = new CustomWriteConverterFactory();
78+
79+
using var rowGroup = writer.AppendRowGroup();
80+
using (var timeWriter = rowGroup.NextColumn().LogicalWriter<TimeSpan>())
81+
{
82+
timeWriter.WriteBatch(timeSpanValues);
83+
}
84+
85+
writer.Close();
86+
87+
internal sealed class CustomWriteConverterFactory : LogicalWriteConverterFactory
88+
{
89+
private const long TicksPerMicrosecond = 10;
90+
91+
public override Delegate GetConverter<TLogical, TPhysical>(ColumnDescriptor columnDescriptor, ByteBuffer? byteBuffer)
92+
{
93+
if (typeof(TLogical) == typeof(TimeSpan))
94+
{
95+
return (LogicalWrite<TimeSpan, long>.Converter) ((source, _, dest, _) =>
96+
{
97+
for (var i = 0; i < source.Length; ++i)
98+
{
99+
dest[i] = source[i].Ticks / TicksPerMicrosecond;
100+
}
101+
});
102+
}
103+
104+
return base.GetConverter<TLogical, TPhysical>(columnDescriptor, byteBuffer);
105+
}
106+
}
107+
```
108+
109+
Note that when using this approach, if you read the file back with
110+
ParquetSharp, the data will be read as `long` values because there's no
111+
way to tell it was originally `TimeSpan` data.
112+
To read the data back as `TimeSpan`s, you'll also need to implement
113+
a custom `LogicalReadConverterFactory` and use the `LogicalReadOverride` method
114+
or provide a custom `LogicalTypeFactory`.
115+
See the [type factories documentation](TypeFactories.md) for more details.

0 commit comments

Comments
 (0)