@@ -18,15 +18,15 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List

 import pytest
 from pyspark.sql import SparkSession
 from pyspark.sql.utils import AnalysisException

 from pyiceberg.catalog import Catalog
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
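The newly imported `pyiceberg.schema.make_compatible_name` is the library's own field-name sanitizer, which the per-case lambda deleted further down used to approximate. A minimal sketch of the behavior the test now relies on (the second field name is hypothetical, and the expected outputs are inferred from the deleted lambda, not from the library source):

```python
# Sketch only, not the library implementation: make_compatible_name leaves
# Avro-compatible names untouched and hex-encodes special characters,
# matching the lambda this commit deletes ("#" -> "_x23", "+" -> "_x2B").
from pyiceberg.schema import make_compatible_name

print(make_compatible_name("string_field"))          # "string_field" (no-op)
print(make_compatible_name("special#string+field"))  # expected: "special_x23string_x2Bfield"
```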
@@ -78,7 +78,7 @@


 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
     [
         # # Identity Transform
         (
@@ -99,7 +99,6 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
-            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -399,7 +385,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Year Transform
         (
@@ -460,7 +443,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # # Day Transform
         (
@@ -521,7 +501,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Hour Transform
         (
@@ -582,7 +559,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         # Truncate Transform
         (
@@ -623,7 +598,6 @@
             VALUES
             (12345, 'Sample data for int');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
-            None,
         ),
         # Bucket Transform
         (
@@ -724,7 +694,6 @@
             VALUES
             (10, 'Integer with value 10');
             """,
-            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -753,7 +722,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
-            None,
         ),
         # Test that special characters are URL-encoded
         (
@@ -773,7 +741,6 @@
             VALUES
             ('special string')
             """,
-            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
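With the parametrize table above losing its trailing `make_compatible_name` column, every case that previously passed `None` now flows through the real helper unconditionally. That is safe because rebuilding a record through `make_compatible_name` is a no-op when no key needs sanitizing; a quick sketch (assuming `Record` comes from `pyiceberg.typedef`, with a hypothetical field name — the diff itself does not show the `Record` import):

```python
from pyiceberg.schema import make_compatible_name
from pyiceberg.typedef import Record  # assumed import location

expected = Record(string_field="sample_string")  # hypothetical expected record
sanitized = Record(**{make_compatible_name(k): v for k, v in vars(expected).items()})
assert sanitized == expected  # identical when no special characters are present
```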
@@ -787,7 +754,6 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
-    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -823,11 +789,6 @@ def test_partition_key(
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
     # Special characters in partition value are sanitized when written to the data file's partition field
-    # Use `make_compatible_name` to match the sanitize behavior
-    sanitized_record = (
-        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-        if make_compatible_name
-        else expected_partition_record
-    )
+    sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
     assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification
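The change should be behavior-preserving: the deleted lambda was hand-rolling the same `#`/`+` hex encoding the library helper performs, and the deleted `if make_compatible_name ... else` branch only skipped a rebuild that is a no-op for compatible names. A quick equivalence check one could run (field names are illustrative):

```python
from pyiceberg.schema import make_compatible_name

# The lambda deleted from the special-characters test case above:
old_sanitizer = lambda name: name.replace("#", "_x23").replace("+", "_x2B")

for name in ("string_field", "special#string+field"):
    # Expected to agree for the characters this suite exercises.
    assert make_compatible_name(name) == old_sanitizer(name)
```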