ci: configure codespell in pre-commit #1478

Merged · 3 commits · Jan 3, 2025
18 changes: 18 additions & 0 deletions .codespellrc
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[codespell]
ignore-words-list = BoundIn,fo,MoR,NotIn,notIn,oT
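
A usage note, hedged: with this file at the repository root, a bare `codespell` run should pick the configuration up automatically, and the `ignore-words-list` entries are code identifiers (such as `BoundIn` and `MoR`) that codespell would otherwise flag as misspellings.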
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -69,6 +69,10 @@ repos:
# --line-length is set to a high value to deal with very long lines
- --line-length
- '99999'
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.3.0
+    hooks:
+      - id: codespell
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto fixes from pre-commit.com hooks
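
To exercise just this hook locally, something like `pre-commit run codespell --all-files` should work once `pre-commit` is installed.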
2 changes: 1 addition & 1 deletion mkdocs/docs/api.md
@@ -1005,7 +1005,7 @@ tbl.add_files(file_paths=file_paths)

## Schema evolution

-PyIceberg supports full schema evolution through the Python API. It takes care of setting the field-IDs and makes sure that only non-breaking changes are done (can be overriden).
+PyIceberg supports full schema evolution through the Python API. It takes care of setting the field-IDs and makes sure that only non-breaking changes are done (can be overridden).

In the examples below, the `.update_schema()` is called from the table itself.

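To make this concrete, a hedged sketch of the API described above (the loaded table `tbl` and the column names are assumptions; `allow_incompatible_changes` is the override the paragraph mentions):

```python
from pyiceberg.types import StringType

# Non-breaking changes, assuming `tbl` is a loaded PyIceberg Table.
with tbl.update_schema() as update:
    update.add_column("notes", StringType(), doc="free-form notes")
    update.rename_column("event_ts", "event_timestamp")

# The non-breaking check can be overridden explicitly:
with tbl.update_schema(allow_incompatible_changes=True) as update:
    update.delete_column("notes")
```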
2 changes: 1 addition & 1 deletion mkdocs/docs/how-to-release.md
@@ -31,7 +31,7 @@ This guide outlines the process for releasing PyIceberg in accordance with the [

* A GPG key must be registered and published in the [Apache Iceberg KEYS file](https://downloads.apache.org/iceberg/KEYS). Follow [the instructions for setting up a GPG key and uploading it to the KEYS file](#set-up-gpg-key-and-upload-to-apache-iceberg-keys-file).
* SVN Access
-* Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Commmitter access).
+* Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Committer access).
* Permission to upload artifacts to the [Apache release distribution](https://dist.apache.org/repos/dist/release/iceberg/) (requires Apache PMC access).
* PyPI Access
* The `twine` package must be installed for uploading releases to PyPi.
2 changes: 1 addition & 1 deletion mkdocs/docs/verify-release.md
@@ -111,7 +111,7 @@ To run the full test coverage, with both unit tests and integration tests:
make test-coverage
```

-This will spin up Docker containers to faciliate running test coverage.
+This will spin up Docker containers to facilitate running test coverage.

# Cast the vote

2 changes: 1 addition & 1 deletion pyiceberg/avro/reader.py
@@ -51,7 +51,7 @@
def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable[[], None]) -> None:
"""Skips over an array or map.

-    Both the array and map are encoded similar, and we can re-use
+    Both the array and map are encoded similar, and we can reuse
the logic of skipping in an efficient way.

From the Avro spec:
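
The spec excerpt is elided by the diff view. For orientation, a minimal sketch of the skip loop the docstring describes, assuming a decoder that exposes `read_int` and `skip` (names here are illustrative): per the Avro spec, arrays and maps are both written as blocks prefixed with an entry count, and a negative count is followed by the block's size in bytes, which allows skipping without decoding.

```python
def _skip_blocks(decoder, skip_entry) -> None:
    # Blocks repeat until a zero count terminates the array or map.
    block_count = decoder.read_int()
    while block_count != 0:
        if block_count < 0:
            # Negative count: the next long is the block size in bytes,
            # so the whole block can be skipped in one jump.
            decoder.skip(decoder.read_int())
        else:
            for _ in range(block_count):
                skip_entry()
        block_count = decoder.read_int()
```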
2 changes: 1 addition & 1 deletion pyiceberg/io/pyarrow.py
@@ -1536,7 +1536,7 @@ def _to_requested_schema(
include_field_ids: bool = False,
use_large_types: bool = True,
) -> pa.RecordBatch:
-    # We could re-use some of these visitors
+    # We could reuse some of these visitors
struct_array = visit_with_partner(
requested_schema,
batch,
2 changes: 1 addition & 1 deletion pyiceberg/table/__init__.py
@@ -902,7 +902,7 @@ def scan(

Args:
row_filter:
-            A string or BooleanExpression that decsribes the
+            A string or BooleanExpression that describes the
desired rows
selected_fields:
A tuple of strings representing the column names
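
For context, a hedged usage sketch of these two arguments (the catalog name and table identifier are assumptions):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")      # assumes a configured catalog
tbl = catalog.load_table("db.events")  # hypothetical table identifier

# row_filter takes a string (or BooleanExpression); selected_fields prunes columns.
scan = tbl.scan(
    row_filter="event_ts >= '2024-01-01'",
    selected_fields=("event_id", "event_ts"),
)
arrow_table = scan.to_arrow()
```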
2 changes: 1 addition & 1 deletion pyiceberg/utils/decimal.py
@@ -85,7 +85,7 @@ def bytes_to_decimal(value: bytes, scale: int) -> Decimal:
"""Return a decimal from the bytes.

Args:
-        value (bytes): tbe bytes to be converted into a decimal.
+        value (bytes): the bytes to be converted into a decimal.
scale (int): the scale of the decimal.

Returns:
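
As a worked illustration of this contract, a sketch assuming Iceberg's encoding of decimals as a big-endian two's-complement unscaled integer:

```python
from decimal import Decimal

def bytes_to_decimal_sketch(value: bytes, scale: int) -> Decimal:
    # Interpret the bytes as a signed big-endian unscaled integer, then apply the scale.
    unscaled = int.from_bytes(value, byteorder="big", signed=True)
    return Decimal(unscaled).scaleb(-scale)

assert bytes_to_decimal_sketch(b"\x04\xd2", 2) == Decimal("12.34")  # 0x04D2 == 1234
```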
2 changes: 1 addition & 1 deletion pyiceberg/utils/singleton.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
"""
-This is a singleton metaclass that can be used to cache and re-use existing objects.
+This is a singleton metaclass that can be used to cache and reuse existing objects.

In the Iceberg codebase we have a lot of objects that are stateless (for example Types such as StringType,
BooleanType etc). FixedTypes have arguments (eg. Fixed[22]) that we also make part of the key when caching
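
A minimal sketch of the pattern this docstring describes (class names here are hypothetical, not the module's actual implementation):

```python
from typing import Any, ClassVar, Dict

class CachingMeta(type):
    """Metaclass that caches instances keyed by class and constructor arguments."""

    _instances: ClassVar[Dict[Any, Any]] = {}

    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
        # Arguments are part of the cache key, so Fixed[22] and Fixed[19] stay distinct.
        key = (cls, args, tuple(sorted(kwargs.items())))
        if key not in CachingMeta._instances:
            CachingMeta._instances[key] = super().__call__(*args, **kwargs)
        return CachingMeta._instances[key]

class ExampleType(metaclass=CachingMeta):
    pass

assert ExampleType() is ExampleType()  # the cached instance is reused
```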
6 changes: 3 additions & 3 deletions tests/integration/test_writes/test_partitioned_writes.py
@@ -395,7 +395,7 @@ def test_dynamic_partition_overwrite_unpartitioned_evolve_to_identity_transform(
# For a long string, the lower bound and upper bound is truncated
# e.g. aaaaaaaaaaaaaaaaaaaaaa has lower bound of aaaaaaaaaaaaaaaa and upper bound of aaaaaaaaaaaaaaab
# this makes strict metric evaluator determine the file evaluate as ROWS_MIGHT_NOT_MATCH
-    # this further causes the partitioned data file to be overwriten rather than deleted
+    # this further causes the partitioned data file to be overwritten rather than deleted
if part_col == "string_long":
expected_operations = ["append", "append", "overwrite", "append"]
assert tbl.inspect.snapshots().to_pydict()["operation"] == expected_operations
@@ -539,7 +539,7 @@ def test_data_files_with_table_partitioned_with_null(
# the first snapshot generates M3 with 6 delete data entries collected from M1 and M2.
# ML3 = [M3]
#
-    # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delte entries it does not lint to it.
+    # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delete entries it does not lint to it.
# ML4 = [M4]

# Append : Append generates M5 with new data entries and links to all previous manifests which is M4 .
@@ -552,7 +552,7 @@ def test_data_files_with_table_partitioned_with_null(
# ML6 = [M6, M7, M8]
#
# The second snapshot generates M9 with 3 appended data entries and it also looks at manifests in ML6 (previous manifests)
-    # it ignores M6 since it only has delte entries but it links to M7 and M8.
+    # it ignores M6 since it only has delete entries but it links to M7 and M8.
# ML7 = [M9, M7, M8]

# tldr:
2 changes: 1 addition & 1 deletion tests/table/test_init.py
@@ -527,7 +527,7 @@ def test_update_column(table_v1: Table, table_v2: Table) -> None:
new_schema = table.transaction().update_schema().update_column("y", doc=COMMENT2)._apply()
assert new_schema.find_field("y").doc == COMMENT2, "failed to update existing field doc"

-    # update existing doc to an emtpy string
+    # update existing doc to an empty string
assert new_schema.find_field("y").doc == COMMENT2
new_schema2 = table.transaction().update_schema().update_column("y", doc="")._apply()
assert new_schema2.find_field("y").doc == "", "failed to remove existing field doc"
2 changes: 1 addition & 1 deletion tests/test_transforms.py
@@ -899,7 +899,7 @@ def test_projection_truncate_string_set_same_result(bound_reference_str: BoundRe
def test_projection_truncate_string_set_in(bound_reference_str: BoundReference[str]) -> None:
assert TruncateTransform(3).project(
"name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("world")})
-    ) == In(term="name", literals={literal("hel"), literal("wor")})
+    ) == In(term="name", literals={literal("hel"), literal("wor")}) # codespell:ignore hel


def test_projection_truncate_string_set_not_in(bound_reference_str: BoundReference[str]) -> None: