apache · lloyd-EA · Jan 24, 2025 · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -2422,6 +2422,8 @@ def _check_pyarrow_schema_compatible(
 
 
 def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]:
+    from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE
+
     for file_path in file_paths:
         input_file = io.new_input(file_path)
         with input_file.open() as input_stream:
@@ -2432,7 +2434,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
                 f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
             )
         schema = table_metadata.schema()
-        _check_pyarrow_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema())
+        downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
+        _check_pyarrow_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema(), downcast_ns_timestamp_to_us)
 
         statistics = data_file_statistics_from_parquet_metadata(
             parquet_metadata=parquet_metadata,

diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py
@@ -597,12 +597,12 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
     arrow_table = pa.Table.from_pylist(
         [
             {
-                "quux": 1615967687249846175,  # 2021-03-17 07:54:47.249846159
+                "quux": 1615967687249846175,  # 2021-03-17 07:54:47.249846175
             }
         ],
         schema=nanoseconds_schema,
     )
-    mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE": "True"})
+    mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE": "False"})
 
     identifier = f"default.timestamptz_ns_added{format_version}"
     tbl = _create_table(session_catalog, identifier, format_version, schema=nanoseconds_schema_iceberg)
@@ -629,6 +629,57 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
     )
 
 
+@pytest.mark.integration
+def test_add_files_with_automatic_downcast_of_timestamp_to_us(
+    spark: SparkSession, session_catalog: Catalog, format_version: int, mocker: MockerFixture
+) -> None:
+    """Check that `add_files` accepts a parquet file with 'ns' timestamps when downcasting is enabled.
+
+    A PyArrow scan returns 'us' values; reading through Spark raises because the file still stores 'ns'.
+    """
+    nanoseconds_schema_iceberg = Schema(NestedField(1, "quux", TimestamptzType()))
+
+    nanoseconds_schema = pa.schema(
+        [
+            ("quux", pa.timestamp("ns", tz="UTC")),
+        ]
+    )
+
+    arrow_table = pa.Table.from_pylist(
+        [
+            {
+                "quux": 1615967687249846175,  # 2021-03-17 07:54:47.249846175
+            }
+        ],
+        schema=nanoseconds_schema,
+    )
+    mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE": "True"})
+
+    identifier = f"default.timestamptz_ns_added{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version, schema=nanoseconds_schema_iceberg)
+
+    file_path = f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test.parquet"
+    # write parquet files
+    fo = tbl.io.new_output(file_path)
+    with fo.create(overwrite=True) as fos:
+        with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer:
+            writer.write_table(arrow_table)
+
+    # add the parquet files as data files
+    tbl.add_files(file_paths=[file_path])
+
+    # checks through pyarrow
+    data_scan = tbl.scan(selected_fields=("quux",)).to_arrow()
+    assert data_scan["quux"].type == pa.timestamp(unit="us", tz="UTC")  # timestamp unit check
+    assert data_scan["quux"][0].value == 1615967687249846  # down-casted value of the timestamp must be 'us' long
+
+    # checks through spark: it cannot downcast 'ns' to 'us' on read. The table schema says 'us' but the
+    # file schema is still 'ns' (add_files does not rewrite the data files to 'us'), so the stored
+    # integer is read as microseconds, which lands in year 53177.
+    with pytest.raises(ValueError, match=re.escape("year 53177 is out of range")):
+        spark.sql(f"""SELECT quux FROM {identifier}""").first()
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_add_file_with_valid_nullability_diff(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: