27
27
import org .apache .iceberg .parquet .ParquetValueReader ;
28
28
import org .apache .iceberg .parquet .ParquetValueReaders ;
29
29
import org .apache .iceberg .parquet .TypeWithSchemaVisitor ;
30
+ import org .apache .iceberg .relocated .com .google .common .base .Preconditions ;
30
31
import org .apache .iceberg .relocated .com .google .common .collect .ImmutableList ;
31
32
import org .apache .iceberg .relocated .com .google .common .collect .ImmutableMap ;
32
33
import org .apache .iceberg .relocated .com .google .common .collect .Lists ;
33
34
import org .apache .iceberg .relocated .com .google .common .collect .Maps ;
35
+ import org .apache .iceberg .types .Type .TypeID ;
34
36
import org .apache .iceberg .types .Types ;
35
37
import org .apache .parquet .column .ColumnDescriptor ;
36
38
import org .apache .parquet .schema .GroupType ;
37
39
import org .apache .parquet .schema .LogicalTypeAnnotation ;
40
+ import org .apache .parquet .schema .LogicalTypeAnnotation .DateLogicalTypeAnnotation ;
38
41
import org .apache .parquet .schema .LogicalTypeAnnotation .DecimalLogicalTypeAnnotation ;
42
+ import org .apache .parquet .schema .LogicalTypeAnnotation .EnumLogicalTypeAnnotation ;
43
+ import org .apache .parquet .schema .LogicalTypeAnnotation .IntLogicalTypeAnnotation ;
44
+ import org .apache .parquet .schema .LogicalTypeAnnotation .JsonLogicalTypeAnnotation ;
45
+ import org .apache .parquet .schema .LogicalTypeAnnotation .LogicalTypeAnnotationVisitor ;
46
+ import org .apache .parquet .schema .LogicalTypeAnnotation .StringLogicalTypeAnnotation ;
47
+ import org .apache .parquet .schema .LogicalTypeAnnotation .TimeLogicalTypeAnnotation ;
48
+ import org .apache .parquet .schema .LogicalTypeAnnotation .TimestampLogicalTypeAnnotation ;
39
49
import org .apache .parquet .schema .MessageType ;
40
50
import org .apache .parquet .schema .PrimitiveType ;
41
51
import org .apache .parquet .schema .Type ;
@@ -78,8 +88,12 @@ protected ParquetValueReader<?> dateReader(ColumnDescriptor desc) {
78
88
return new GenericParquetReaders .DateReader (desc );
79
89
}
80
90
81
- protected ParquetValueReader <?> timeReader (
82
- ColumnDescriptor desc , LogicalTypeAnnotation .TimeUnit unit ) {
91
+ protected ParquetValueReader <?> timeReader (ColumnDescriptor desc ) {
92
+ LogicalTypeAnnotation time = desc .getPrimitiveType ().getLogicalTypeAnnotation ();
93
+ Preconditions .checkArgument (
94
+ time instanceof TimeLogicalTypeAnnotation , "Invalid time logical type: " + time );
95
+
96
+ LogicalTypeAnnotation .TimeUnit unit = ((TimeLogicalTypeAnnotation ) time ).getUnit ();
83
97
switch (unit ) {
84
98
case MICROS :
85
99
return new GenericParquetReaders .TimeReader (desc );
@@ -90,12 +104,17 @@ protected ParquetValueReader<?> timeReader(
90
104
}
91
105
}
92
106
93
- protected ParquetValueReader <?> timestampReader (
94
- ColumnDescriptor desc , LogicalTypeAnnotation .TimeUnit unit , boolean isAdjustedToUTC ) {
107
+ protected ParquetValueReader <?> timestampReader (ColumnDescriptor desc , boolean isAdjustedToUTC ) {
95
108
if (desc .getPrimitiveType ().getPrimitiveTypeName () == PrimitiveType .PrimitiveTypeName .INT96 ) {
96
109
return new GenericParquetReaders .TimestampInt96Reader (desc );
97
110
}
98
111
112
+ LogicalTypeAnnotation timestamp = desc .getPrimitiveType ().getLogicalTypeAnnotation ();
113
+ Preconditions .checkArgument (
114
+ timestamp instanceof TimestampLogicalTypeAnnotation ,
115
+ "Invalid timestamp logical type: " + timestamp );
116
+
117
+ LogicalTypeAnnotation .TimeUnit unit = ((TimestampLogicalTypeAnnotation ) timestamp ).getUnit ();
99
118
switch (unit ) {
100
119
case MICROS :
101
120
return isAdjustedToUTC
@@ -148,96 +167,79 @@ public ParquetValueReader<?> struct(
148
167
}
149
168
}
150
169
151
- private class LogicalTypeAnnotationParquetValueReaderVisitor
152
- implements LogicalTypeAnnotation . LogicalTypeAnnotationVisitor <ParquetValueReader <?>> {
170
+ private class LogicalTypeReadBuilder
171
+ implements LogicalTypeAnnotationVisitor <ParquetValueReader <?>> {
153
172
154
173
/** Descriptor of the Parquet column that every reader built by this visitor is created for. */
private final ColumnDescriptor desc;

/** Iceberg primitive type expected for the column; consulted when a reader depends on it. */
private final org.apache.iceberg.types.Type.PrimitiveType expected;

/**
 * Creates a visitor bound to a single column.
 *
 * @param desc descriptor of the Parquet column to read
 * @param expected Iceberg primitive type expected for the column
 */
LogicalTypeReadBuilder(
    ColumnDescriptor desc,
    org.apache.iceberg.types.Type.PrimitiveType expected) {
  this.expected = expected;
  this.desc = desc;
}
166
181
167
182
@ Override
168
- public Optional <ParquetValueReader <?>> visit (
169
- LogicalTypeAnnotation .StringLogicalTypeAnnotation stringLogicalType ) {
170
- return Optional .of (new ParquetValueReaders .StringReader (desc ));
183
+ public Optional <ParquetValueReader <?>> visit (StringLogicalTypeAnnotation stringLogicalType ) {
184
+ return Optional .of (ParquetValueReaders .strings (desc ));
171
185
}
172
186
173
187
@ Override
174
- public Optional <ParquetValueReader <?>> visit (
175
- LogicalTypeAnnotation .EnumLogicalTypeAnnotation enumLogicalType ) {
176
- return Optional .of (new ParquetValueReaders .StringReader (desc ));
188
+ public Optional <ParquetValueReader <?>> visit (EnumLogicalTypeAnnotation enumLogicalType ) {
189
+ return Optional .of (ParquetValueReaders .strings (desc ));
177
190
}
178
191
179
192
@ Override
180
193
public Optional <ParquetValueReader <?>> visit (DecimalLogicalTypeAnnotation decimalLogicalType ) {
181
- switch (primitive .getPrimitiveTypeName ()) {
182
- case BINARY :
183
- case FIXED_LEN_BYTE_ARRAY :
184
- return Optional .of (
185
- new ParquetValueReaders .BinaryAsDecimalReader (desc , decimalLogicalType .getScale ()));
186
- case INT64 :
187
- return Optional .of (
188
- new ParquetValueReaders .LongAsDecimalReader (desc , decimalLogicalType .getScale ()));
189
- case INT32 :
190
- return Optional .of (
191
- new ParquetValueReaders .IntegerAsDecimalReader (desc , decimalLogicalType .getScale ()));
192
- default :
193
- throw new UnsupportedOperationException (
194
- "Unsupported base type for decimal: " + primitive .getPrimitiveTypeName ());
195
- }
194
+ return Optional .of (ParquetValueReaders .bigDecimals (desc ));
196
195
}
197
196
198
197
@ Override
199
- public Optional <ParquetValueReader <?>> visit (
200
- LogicalTypeAnnotation .DateLogicalTypeAnnotation dateLogicalType ) {
198
+ public Optional <ParquetValueReader <?>> visit (DateLogicalTypeAnnotation dateLogicalType ) {
201
199
return Optional .of (dateReader (desc ));
202
200
}
203
201
204
202
@ Override
205
- public Optional <ParquetValueReader <?>> visit (
206
- LogicalTypeAnnotation .TimeLogicalTypeAnnotation timeLogicalType ) {
207
- return Optional .of (timeReader (desc , timeLogicalType .getUnit ()));
203
+ public Optional <ParquetValueReader <?>> visit (TimeLogicalTypeAnnotation timeLogicalType ) {
204
+ return Optional .of (timeReader (desc ));
208
205
}
209
206
210
207
@ Override
211
208
public Optional <ParquetValueReader <?>> visit (
212
- LogicalTypeAnnotation . TimestampLogicalTypeAnnotation timestampLogicalType ) {
209
+ TimestampLogicalTypeAnnotation timestampLogicalType ) {
213
210
return Optional .of (
214
- timestampReader (
215
- desc ,
216
- timestampLogicalType .getUnit (),
217
- ((Types .TimestampType ) expected ).shouldAdjustToUTC ()));
211
+ timestampReader (desc , ((Types .TimestampType ) expected ).shouldAdjustToUTC ()));
218
212
}
219
213
220
214
@ Override
221
- public Optional <ParquetValueReader <?>> visit (
222
- LogicalTypeAnnotation .IntLogicalTypeAnnotation intLogicalType ) {
215
+ public Optional <ParquetValueReader <?>> visit (IntLogicalTypeAnnotation intLogicalType ) {
223
216
if (intLogicalType .getBitWidth () == 64 ) {
217
+ Preconditions .checkArgument (
218
+ intLogicalType .isSigned (), "Cannot read UINT64 as a long value" );
219
+
224
220
return Optional .of (new ParquetValueReaders .UnboxedReader <>(desc ));
225
221
}
226
- return (expected .typeId () == org .apache .iceberg .types .Type .TypeID .LONG )
227
- ? Optional .of (new ParquetValueReaders .IntAsLongReader (desc ))
228
- : Optional .of (new ParquetValueReaders .UnboxedReader <>(desc ));
222
+
223
+ if (expected .typeId () == TypeID .LONG ) {
224
+ return Optional .of (new ParquetValueReaders .IntAsLongReader (desc ));
225
+ }
226
+
227
+ Preconditions .checkArgument (
228
+ intLogicalType .isSigned () || intLogicalType .getBitWidth () < 32 ,
229
+ "Cannot read UINT32 as an int value" );
230
+
231
+ return Optional .of (new ParquetValueReaders .UnboxedReader <>(desc ));
229
232
}
230
233
231
234
@ Override
232
- public Optional <ParquetValueReader <?>> visit (
233
- LogicalTypeAnnotation .JsonLogicalTypeAnnotation jsonLogicalType ) {
234
- return Optional .of (new ParquetValueReaders .StringReader (desc ));
235
+ public Optional <ParquetValueReader <?>> visit (JsonLogicalTypeAnnotation jsonLogicalType ) {
236
+ return Optional .of (ParquetValueReaders .strings (desc ));
235
237
}
236
238
237
239
@ Override
238
240
public Optional <ParquetValueReader <?>> visit (
239
241
LogicalTypeAnnotation .BsonLogicalTypeAnnotation bsonLogicalType ) {
240
- return Optional .of (new ParquetValueReaders .BytesReader (desc ));
242
+ return Optional .of (ParquetValueReaders .byteBuffers (desc ));
241
243
}
242
244
243
245
@ Override
@@ -388,7 +390,7 @@ public ParquetValueReader<?> primitive(
388
390
if (primitive .getLogicalTypeAnnotation () != null ) {
389
391
return primitive
390
392
.getLogicalTypeAnnotation ()
391
- .accept (new LogicalTypeAnnotationParquetValueReaderVisitor (desc , expected , primitive ))
393
+ .accept (new LogicalTypeReadBuilder (desc , expected ))
392
394
.orElseThrow (
393
395
() ->
394
396
new UnsupportedOperationException (
@@ -399,31 +401,31 @@ public ParquetValueReader<?> primitive(
399
401
case FIXED_LEN_BYTE_ARRAY :
400
402
return fixedReader (desc );
401
403
case BINARY :
402
- if (expected .typeId () == org . apache . iceberg . types . Type . TypeID .STRING ) {
403
- return new ParquetValueReaders .StringReader (desc );
404
+ if (expected .typeId () == TypeID .STRING ) {
405
+ return ParquetValueReaders .strings (desc );
404
406
} else {
405
- return new ParquetValueReaders .BytesReader (desc );
407
+ return ParquetValueReaders .byteBuffers (desc );
406
408
}
407
409
case INT32 :
408
- if (expected .typeId () == org . apache . iceberg . types . Type . TypeID .LONG ) {
409
- return new ParquetValueReaders .IntAsLongReader (desc );
410
+ if (expected .typeId () == TypeID .LONG ) {
411
+ return ParquetValueReaders .intsAsLongs (desc );
410
412
} else {
411
- return new ParquetValueReaders .UnboxedReader <> (desc );
413
+ return ParquetValueReaders .unboxed (desc );
412
414
}
413
415
case FLOAT :
414
- if (expected .typeId () == org . apache . iceberg . types . Type . TypeID .DOUBLE ) {
415
- return new ParquetValueReaders .FloatAsDoubleReader (desc );
416
+ if (expected .typeId () == TypeID .DOUBLE ) {
417
+ return ParquetValueReaders .floatsAsDoubles (desc );
416
418
} else {
417
- return new ParquetValueReaders .UnboxedReader <> (desc );
419
+ return ParquetValueReaders .unboxed (desc );
418
420
}
419
421
case BOOLEAN :
420
422
case INT64 :
421
423
case DOUBLE :
422
- return new ParquetValueReaders .UnboxedReader <> (desc );
424
+ return ParquetValueReaders .unboxed (desc );
423
425
case INT96 :
424
426
// Impala & Spark used to write timestamps as INT96 without a logical type. For backwards
425
427
// compatibility we try to read INT96 as timestamps.
426
- return timestampReader (desc , LogicalTypeAnnotation . TimeUnit . NANOS , true );
428
+ return timestampReader (desc , true );
427
429
default :
428
430
throw new UnsupportedOperationException ("Unsupported type: " + primitive );
429
431
}
0 commit comments