From 0812e33744db96fa56a12e1af5d1d52f9b7da49d Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 14:54:27 +0100 Subject: [PATCH 01/60] feat: initial setup for S3TablesCatalog --- poetry.lock | 3 +- pyiceberg/catalog/__init__.py | 4 +- pyiceberg/catalog/s3tables.py | 228 +++++++++++++++++++++++++++++++++ pyiceberg/exceptions.py | 5 + pyproject.toml | 1 + tests/catalog/test_s3tables.py | 71 ++++++++++ 6 files changed, 309 insertions(+), 3 deletions(-) create mode 100644 pyiceberg/catalog/s3tables.py create mode 100644 tests/catalog/test_s3tables.py diff --git a/poetry.lock b/poetry.lock index ae19f3a0e2..4a695895b5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5350,6 +5350,7 @@ pyiceberg-core = ["pyiceberg-core"] ray = ["pandas", "pyarrow", "ray", "ray"] rest-sigv4 = ["boto3"] s3fs = ["s3fs"] +s3tables = ["boto3"] snappy = ["python-snappy"] sql-postgres = ["psycopg2-binary", "sqlalchemy"] sql-sqlite = ["sqlalchemy"] @@ -5358,4 +5359,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "66b432ec3645ad122e1c668f795fee1160fae2fd10012e2dfd04c04af90806ec" +content-hash = "95b2b91331fa7da405234dfb437881af27a4f0eed987628397c8b0428707636f" diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index c3ea23c1d2..c54ccfc47d 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -914,8 +914,8 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - raise ValueError("No default path is set, please specify a location when creating a table") @staticmethod - def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None: - ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) + def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str, overwrite: bool = False) -> None: + ToOutputFile.table_metadata(metadata, io.new_output(metadata_path), overwrite=overwrite) @staticmethod def _get_metadata_location(location: str, new_version: int = 0) -> str: diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py new file mode 100644 index 0000000000..82cd862571 --- /dev/null +++ b/pyiceberg/catalog/s3tables.py @@ -0,0 +1,228 @@ +import re +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Set +from typing import Tuple +from typing import Union + +import boto3 + +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog +from pyiceberg.catalog import WAREHOUSE_LOCATION +from pyiceberg.catalog import Catalog +from pyiceberg.catalog import PropertiesUpdateSummary +from pyiceberg.exceptions import S3TablesError +from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.io import AWS_ACCESS_KEY_ID +from pyiceberg.io import AWS_REGION +from pyiceberg.io import AWS_SECRET_ACCESS_KEY +from pyiceberg.io import AWS_SESSION_TOKEN +from pyiceberg.io import load_file_io +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec +from pyiceberg.schema import Schema +from pyiceberg.serializers import FromInputFile +from pyiceberg.table import CommitTableResponse +from pyiceberg.table import CreateTableTransaction +from pyiceberg.table import Table +from pyiceberg.table import TableRequirement +from pyiceberg.table.metadata import new_table_metadata +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.update import TableUpdate +from pyiceberg.typedef import EMPTY_DICT, 
Identifier +from pyiceberg.typedef import Properties +from pyiceberg.utils.properties import get_first_property_value + +if TYPE_CHECKING: + import pyarrow as pa + +S3TABLES_PROFILE_NAME = "s3tables.profile-name" +S3TABLES_REGION = "s3tables.region" +S3TABLES_ACCESS_KEY_ID = "s3tables.access-key-id" +S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" +S3TABLES_SESSION_TOKEN = "s3tables.session-token" + +S3TABLES_ENDPOINT = "s3tables.endpoint" + + +class S3TableCatalog(MetastoreCatalog): + def __init__(self, name: str, **properties: str): + super().__init__(name, **properties) + + session = boto3.Session( + profile_name=properties.get(S3TABLES_PROFILE_NAME), + region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), + botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), + aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + aws_secret_access_key=get_first_property_value( + properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY + ), + aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), + ) + # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash + # TODO: set a custom user-agent for api calls like the Java implementation + self.s3tables = session.client("s3tables") + # TODO: handle malformed properties instead of just raising a key error here + self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION] + try: + self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn) + except self.s3tables.exceptions.NotFoundException as e: + raise TableBucketNotFound(e) from e + + def commit_table( + self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + ) -> CommitTableResponse: + return super().commit_table(table, requirements, updates) + + def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None: + valid_namespace: str = self._validate_namespace_identifier(namespace) + self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) + + def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + reserved = "aws_s3_metadata" + + namespace = self.identifier_to_database(namespace) + + if not pattern.fullmatch(namespace): + ... + + if namespace == reserved: + ... + + return namespace + + def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + + namespace, table_name = self.identifier_to_database_and_table(identifier) + + namespace = self._validate_namespace_identifier(namespace) + + if not pattern.fullmatch(table_name): + ... 
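+            # NOTE: a sketch of what this stub might eventually raise; the exact
+            # exception type is an open question here (S3TablesError is simply the
+            # base error this patch defines in pyiceberg/exceptions.py):
+            #     raise S3TablesError(f"Invalid table name: {table_name}")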
+ + return namespace, table_name + + def create_table( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + + schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + + # TODO: check whether namespace exists and if it does, whether table_name already exists + self.s3tables.create_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + ) + + # location is given by s3 table bucket + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + version_token = response["versionToken"] + + location = response["warehouseLocation"] + metadata_location = self._get_metadata_location(location=location) + metadata = new_table_metadata( + location=location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, + ) + + io = load_file_io(properties=self.properties, location=metadata_location) + # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # TODO: we can perform this check manually maybe? + self._write_metadata(metadata, io, metadata_location, overwrite=True) + # TODO: after writing need to update table metadata location + # can this operation fail if the version token does not match? + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + + return self.load_table(identifier=identifier) + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = ..., + sort_order: SortOrder = ..., + properties: Properties = ..., + ) -> CreateTableTransaction: + return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) + + def drop_namespace(self, namespace: Union[str, Identifier]) -> None: + return super().drop_namespace(namespace) + + def drop_table(self, identifier: Union[str, Identifier]) -> None: + return super().drop_table(identifier) + + def drop_view(self, identifier: Union[str, Identifier]) -> None: + return super().drop_view(identifier) + + def list_namespaces(self, namespace: Union[str, Identifier] = ...) 
-> List[Identifier]: + # TODO: handle pagination + response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn) + return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] + + def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_tables(namespace) + + def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_views(namespace) + + def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: + return super().load_namespace_properties(namespace) + + def load_table(self, identifier: Union[str, Identifier]) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + # TODO: raise a NoSuchTableError if it does not exist + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + # TODO: we might need to catch if table is not initialized i.e. does not have metadata setup yet + metadata_location = response["metadataLocation"] + + io = load_file_io(properties=self.properties, location=metadata_location) + file = io.new_input(metadata_location) + metadata = FromInputFile.table_metadata(file) + return Table( + identifier=(namespace, table_name), + metadata=metadata, + metadata_location=metadata_location, + io=self._load_file_io(metadata.properties, metadata_location), + catalog=self, + ) + + def purge_table(self, identifier: Union[str, Identifier]) -> None: + return super().purge_table(identifier) + + def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: + return super().register_table(identifier, metadata_location) + + def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: + return super().rename_table(from_identifier, to_identifier) + + def table_exists(self, identifier: Union[str, Identifier]) -> bool: + return super().table_exists(identifier) + + def update_namespace_properties( + self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... 
+    ) -> PropertiesUpdateSummary:
+        return super().update_namespace_properties(namespace, removals, updates)
diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index 56574ff471..6096096f45 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -111,6 +111,11 @@ class ConditionalCheckFailedException(DynamoDbError):
 class GenericDynamoDbError(DynamoDbError):
     pass
 
+class S3TablesError(Exception):
+    pass
+
+class TableBucketNotFound(S3TablesError):
+    pass
 
 class CommitFailedException(Exception):
     """Commit failed, refresh and try again."""
diff --git a/pyproject.toml b/pyproject.toml
index 7d28e89832..dfa6f286c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1211,6 +1211,7 @@ sql-postgres = ["sqlalchemy", "psycopg2-binary"]
 sql-sqlite = ["sqlalchemy"]
 gcsfs = ["gcsfs"]
 rest-sigv4 = ["boto3"]
+s3tables = ["boto3"]
 pyiceberg-core = ["pyiceberg-core"]
 
 [tool.pytest.ini_options]
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
new file mode 100644
index 0000000000..220b0ad507
--- /dev/null
+++ b/tests/catalog/test_s3tables.py
@@ -0,0 +1,71 @@
+import boto3
+from pyiceberg.schema import Schema
+import pytest
+
+from pyiceberg.catalog.s3tables import S3TableCatalog
+from pyiceberg.exceptions import TableBucketNotFound
+
+
+@pytest.fixture
+def database_name(database_name):
+    # naming rules prevent "-" in namespaces for s3 table buckets
+    return database_name.replace("-", "_")
+
+
+@pytest.fixture
+def table_name(table_name):
+    # naming rules prevent "-" in table names for s3 table buckets
+    return table_name.replace("-", "_")
+
+@pytest.fixture
+def table_bucket_arn():
+    import os
+    # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
+    # in one of the supported regions.
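+    # the ARN is read from the environment; an illustrative (made-up) value:
+    #   export ARN="arn:aws:s3tables:us-east-1:111122223333:bucket/example-bucket"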
+
+    return os.environ["ARN"]
+
+
+def test_s3tables_boto_api(table_bucket_arn):
+    client = boto3.client("s3tables")
+    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
+    print(response["namespaces"])
+
+    response = client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc")
+    print(response)
+
+
+
+def test_s3tables_namespaces_api(table_bucket_arn):
+    client = boto3.client("s3tables")
+    response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"])
+    print(response)
+    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
+    print(response)
+
+def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
+    properties = {"warehouse": f"{table_bucket_arn}-modified"}
+    with pytest.raises(TableBucketNotFound):
+        S3TableCatalog(name="test_s3tables_catalog", **properties)
+
+
+def test_create_namespace(table_bucket_arn, database_name: str):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    catalog.create_namespace(namespace=database_name)
+    namespaces = catalog.list_namespaces()
+    assert (database_name,) in namespaces
+
+
+def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    print(database_name, table_name)
+    # this fails with
+    # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw
+    # Unable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01
+    table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    print(table)

From ba172bbb643bad23b1af7bb58a9324299f8ab4b1 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 17:41:40 +0100
Subject: [PATCH 02/60] feat: support create_table using FsspecFileIO

---
 pyiceberg/catalog/s3tables.py  |  7 ++++++-
 tests/catalog/test_s3tables.py | 27 ++++-----------------------
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 82cd862571..740e72a124 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -14,7 +14,7 @@
 from pyiceberg.catalog import PropertiesUpdateSummary
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
-from pyiceberg.io import AWS_ACCESS_KEY_ID
+from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL
 from pyiceberg.io import AWS_REGION
 from pyiceberg.io import AWS_SECRET_ACCESS_KEY
 from pyiceberg.io import AWS_SESSION_TOKEN
@@ -44,10 +44,15 @@
 
 S3TABLES_ENDPOINT = "s3tables.endpoint"
 
+# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
+S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO"
+
 
 class S3TableCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
+        # TODO: implement a proper check for FileIO
+        self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT
 
         session = boto3.Session(
             profile_name=properties.get(S3TABLES_PROFILE_NAME),
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 220b0ad507..855b144a14 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -26,23 +26,6 @@ def table_bucket_arn():
     return os.environ["ARN"]
 
 
-def test_s3tables_boto_api(table_bucket_arn):
-    client = boto3.client("s3tables")
-    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
-    print(response["namespaces"])
-
-    response = client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc")
-    print(response)
-
-
-
-def test_s3tables_namespaces_api(table_bucket_arn):
-    client = boto3.client("s3tables")
-    response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"])
-    print(response)
-    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
-    print(response)
-
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
     properties = {"warehouse": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
@@ -58,14 +41,12 @@ def test_create_namespace(table_bucket_arn, database_name: str):
 
 
 def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
-    properties = {"warehouse": table_bucket_arn}
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
     identifier = (database_name, table_name)
 
     catalog.create_namespace(namespace=database_name)
-    print(database_name, table_name)
-    # this fails with
-    # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw
-    # Unable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01
     table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
-    print(table)
+
+    assert table == catalog.load_table(identifier)

From db70192bf8a2374ac8c88c475ff208f5fae42055 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 18:13:38 +0100
Subject: [PATCH 03/60] feat: implement drop_table

---
 pyiceberg/catalog/s3tables.py  | 38 ++++++++++++++++++++++++++--------
 tests/catalog/test_s3tables.py | 18 +++++++++++++++-
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 740e72a124..7234e7223a 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -8,18 +8,22 @@
 
 import boto3
 
-from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog
+from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION
 from pyiceberg.catalog import WAREHOUSE_LOCATION
 from pyiceberg.catalog import Catalog
+from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
+from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
-from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL
+from pyiceberg.io import AWS_ACCESS_KEY_ID
 from pyiceberg.io import AWS_REGION
 from pyiceberg.io import AWS_SECRET_ACCESS_KEY
 from pyiceberg.io import AWS_SESSION_TOKEN
+from pyiceberg.io import PY_IO_IMPL
 from pyiceberg.io import load_file_io
-from pyiceberg.partitioning
import UNPARTITIONED_PARTITION_SPEC +from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse @@ -27,9 +31,11 @@ from pyiceberg.table import Table from pyiceberg.table import TableRequirement from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER +from pyiceberg.table.sorting import SortOrder from pyiceberg.table.update import TableUpdate -from pyiceberg.typedef import EMPTY_DICT, Identifier +from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import Identifier from pyiceberg.typedef import Properties from pyiceberg.utils.properties import get_first_property_value @@ -176,7 +182,18 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: return super().drop_namespace(namespace) def drop_table(self, identifier: Union[str, Identifier]) -> None: - return super().drop_table(identifier) + namespace, table_name = self._validate_database_and_table_identifier(identifier) + try: + response = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e + + # TODO: handle conflicts due to versionToken mismatch that might occur + self.s3tables.delete_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"] + ) def drop_view(self, identifier: Union[str, Identifier]) -> None: return super().drop_view(identifier) @@ -198,9 +215,12 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper def load_table(self, identifier: Union[str, Identifier]) -> Table: namespace, table_name = self._validate_database_and_table_identifier(identifier) # TODO: raise a NoSuchTableError if it does not exist - response = self.s3tables.get_table_metadata_location( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + try: + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e # TODO: we might need to catch if table is not initialized i.e. 
does not have metadata setup yet
         metadata_location = response["metadataLocation"]
 
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 855b144a14..9a68cdb794 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -3,7 +3,7 @@
 import pytest
 
 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
 
 
 @pytest.fixture
@@ -50,3 +50,19 @@ def test_create_table(table_bucket_arn, database_name: str, table_name:str, tabl
     table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
 
     assert table == catalog.load_table(identifier)
+
+
+
+def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    catalog.drop_table(identifier=identifier)
+
+    with pytest.raises(NoSuchTableError):
+        catalog.load_table(identifier=identifier)

From 324de35c92a7fdd4dd45a7b9c0a374b29a531866 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 18:20:02 +0100
Subject: [PATCH 04/60] feat: implement drop_namespace

---
 pyiceberg/catalog/s3tables.py  | 9 +++++++--
 tests/catalog/test_s3tables.py | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 7234e7223a..fd7d7addc0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -13,7 +13,7 @@
 from pyiceberg.catalog import Catalog
 from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
-from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.io import AWS_ACCESS_KEY_ID
@@ -179,7 +179,12 @@ def create_table_transaction(
         return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties)
 
     def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
-        return super().drop_namespace(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        try:
+            self.s3tables.delete_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace)
+        except self.s3tables.exceptions.ConflictException as e:
+            raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e
+
 
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 9a68cdb794..0503fff9c1 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -39,6 +39,15 @@ def test_create_namespace(table_bucket_arn, database_name: str):
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces
 
+def test_drop_namespace(table_bucket_arn, database_name: str):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    catalog.create_namespace(namespace=database_name)
+    assert (database_name,) in catalog.list_namespaces()
+    catalog.drop_namespace(namespace=database_name)
+    assert (database_name,) not in catalog.list_namespaces()
+
+
 def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet

From 3c8fe2ccef1156e90acfd8732dedb4932918fdd9 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 15 Dec 2024 12:01:36 +0100
Subject: [PATCH 05/60] test: validate how version conflict is handled with s3tables API

---
 tests/catalog/test_s3tables.py | 40 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 0503fff9c1..c7b9d13df8 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -1,9 +1,12 @@
+import uuid
+
 import boto3
-from pyiceberg.schema import Schema
 import pytest
 
 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.exceptions import TableBucketNotFound
+from pyiceberg.schema import Schema
 
 
 @pytest.fixture
@@ -17,15 +20,41 @@ def table_name(table_name):
     # naming rules prevent "-" in table names for s3 table buckets
     return table_name.replace("-", "_")
 
+
 @pytest.fixture
 def table_bucket_arn():
     import os
+
     # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
     # in one of the supported regions.
     return os.environ["ARN"]
 
 
+def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
+    client = boto3.client("s3tables")
+    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
+    response = client.create_table(
+        tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG"
+    )
+    version_token = response["versionToken"]
+    scrambled_version_token = version_token[::-1]
+
+    warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[
+        "warehouseLocation"
+    ]
+    metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json"
+
+    with pytest.raises(client.exceptions.ConflictException):
+        client.update_table_metadata_location(
+            tableBucketARN=table_bucket_arn,
+            namespace=database_name,
+            name=table_name,
+            versionToken=scrambled_version_token,
+            metadataLocation=metadata_location,
+        )
+
+
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
     properties = {"warehouse": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
@@ -39,6 +68,7 @@ def test_create_namespace(table_bucket_arn, database_name: str):
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces
 
+
 def test_drop_namespace(table_bucket_arn, database_name: str):
     properties = {"warehouse": table_bucket_arn}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
@@ -48,8 +78,7 @@ def test_drop_namespace(table_bucket_arn, database_name: str):
     assert (database_name,) not in catalog.list_namespaces()
 
 
-
-def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
     identifier = (database_name, table_name)
 
     catalog.create_namespace(namespace=database_name)
     table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
 
     assert table == catalog.load_table(identifier)
 
 
-def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
     identifier = (database_name, table_name)

From e1772f76ab41440ba797137396c6010a8255205e Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 15 Dec 2024 13:40:23 +0100
Subject: [PATCH 06/60] feat: implement commit_table

---
 pyiceberg/catalog/s3tables.py  | 52 +++++++++++++++++++++++++++++---
 tests/catalog/test_s3tables.py | 30 ++++++++++++++++++++
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index fd7d7addc0..1fb384b21d 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -13,7 +13,9 @@
 from pyiceberg.catalog import Catalog
 from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
-from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError
+from pyiceberg.exceptions import CommitFailedException
+from pyiceberg.exceptions import NamespaceNotEmptyError
+from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.io import AWS_ACCESS_KEY_ID
@@ -83,7 +85,47 @@ def __init__(self, name: str, **properties: str):
 
     def commit_table(
         self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...]
) -> CommitTableResponse: - return super().commit_table(table, requirements, updates) + table_identifier = table.name() + database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) + + # TODO: loading table and getting versionToken should be an atomic operation, otherwise + # the table can change between those two API calls without noticing + # -> change this into an internal API that returns table information along with versionToken + current_table = self.load_table(identifier=table_identifier) + version_token = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name + )["versionToken"] + + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if current_table and updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse( + metadata=current_table.metadata, metadata_location=current_table.metadata_location + ) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, + ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) 
-> None:
         valid_namespace: str = self._validate_namespace_identifier(namespace)
@@ -185,7 +227,6 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         except self.s3tables.exceptions.ConflictException as e:
             raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e
 
-
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
         try:
@@ -197,7 +238,10 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
 
         # TODO: handle conflicts due to versionToken mismatch that might occur
         self.s3tables.delete_table(
-            tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"]
+            tableBucketARN=self.table_bucket_arn,
+            namespace=namespace,
+            name=table_name,
+            versionToken=response["versionToken"],
         )
 
     def drop_view(self, identifier: Union[str, Identifier]) -> None:
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index c7b9d13df8..d185bd795d 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -7,6 +7,7 @@
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.schema import Schema
+from pyiceberg.types import IntegerType
 
 
 @pytest.fixture
@@ -103,3 +104,32 @@ def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table
 
     with pytest.raises(NoSuchTableError):
         catalog.load_table(identifier=identifier)
+
+
+def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    last_updated_ms = table.metadata.last_updated_ms
+    original_table_metadata_location = table.metadata_location
+    original_table_last_updated_ms = table.metadata.last_updated_ms
+
+
+
+    transaction = table.transaction()
+    update = transaction.update_schema()
+    update.add_column(path="b", field_type=IntegerType())
+    update.commit()
+    transaction.commit_transaction()
+
+    updated_table_metadata = table.metadata
+    assert updated_table_metadata.current_schema_id == 1
+    assert len(updated_table_metadata.schemas) == 2
+    assert updated_table_metadata.last_updated_ms > last_updated_ms
+    assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
+    assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms

From ec53f200f5038ea1ce8dc41615f3392264cb5fff Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:37:10 +0100
Subject: [PATCH 07/60] feat: implement table_exists

---
 pyiceberg/catalog/s3tables.py  |  9 ++++++++-
 tests/catalog/test_s3tables.py | 12 ++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 1fb384b21d..947157320d 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -294,7 +294,14 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
         return super().rename_table(from_identifier, to_identifier)
 
     def table_exists(self, identifier: Union[str, Identifier]) -> bool:
-        return super().table_exists(identifier)
+        namespace, table_name = self._validate_database_and_table_identifier(identifier)
+        try:
+            self.s3tables.get_table(
+                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name
+            )
+        except self.s3tables.exceptions.NotFoundException:
+            return False
+        return True
 
     def update_namespace_properties(
         self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ...
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d185bd795d..f33d04d953 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -91,6 +91,18 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab
     assert table == catalog.load_table(identifier)
 
 
+def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    assert not catalog.table_exists(identifier=identifier)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    assert catalog.table_exists(identifier=identifier)
+
+
 def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}

From d660fce8bec2539d0800056ae683a12a756f4af1 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:53:07 +0100
Subject: [PATCH 08/60] feat: implement list_tables

---
 pyiceberg/catalog/s3tables.py  |  7 ++++++-
 tests/catalog/test_s3tables.py | 12 ++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 947157320d..047c689afd 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -253,7 +253,12 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ...) -> List[Ident
         return [tuple(namespace["namespace"]) for namespace in response["namespaces"]]
 
     def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
-        return super().list_tables(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        paginator = self.s3tables.get_paginator("list_tables")
+        tables: List[str] = []
+        for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace):
+            tables.extend(table["name"] for table in page["tables"])
+        return tables
 
     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         return super().list_views(namespace)
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index f33d04d953..e74bddc973 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -103,6 +103,18 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=identifier)
 
 
+def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    assert not catalog.list_tables(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    assert catalog.list_tables(namespace=database_name)
+
+
 def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}

From 55300733ab1ef1a0f2d2be49c8c9ad0d0e8b58d0 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:59:14 +0100
Subject: [PATCH 09/60] refactor: improve list_namespaces

---
 pyiceberg/catalog/s3tables.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 047c689afd..9e223dcc7a 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -247,10 +247,17 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
     def drop_view(self, identifier: Union[str, Identifier]) -> None:
         return super().drop_view(identifier)
 
-    def list_namespaces(self, namespace: Union[str, Identifier] = ...) -> List[Identifier]:
-        # TODO: handle pagination
-        response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn)
-        return [tuple(namespace["namespace"]) for namespace in response["namespaces"]]
+    def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
+        # TODO: s3tables only supports single-level namespaces
+        if namespace:
+            namespace = self._validate_namespace_identifier(namespace)
+        paginator = self.s3tables.get_paginator("list_namespaces")
+
+        namespaces: List[Identifier] = []
+        for page in paginator.paginate(tableBucketARN=self.table_bucket_arn):
+            namespaces.extend(tuple(entry["namespace"]) for entry in page["namespaces"])
+
+        return namespaces
 
     def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         namespace = self._validate_namespace_identifier(namespace)

From 0399a94304b5ab1d8253bf234a121d2a22d5de15 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:59:50 +0100
Subject: [PATCH 10/60] fix: return Identifier from list_tables

---
 pyiceberg/catalog/s3tables.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 9e223dcc7a..f974c354bd 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -262,9 +262,9 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi
     def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         namespace = self._validate_namespace_identifier(namespace)
         paginator = self.s3tables.get_paginator("list_tables")
-        tables: List[str] = []
+        tables: List[Identifier] = []
         for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace):
-            tables.extend(table["name"] for table in page["tables"])
+            tables.extend((namespace, table["name"]) for table in page["tables"])
         return tables
 
     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:

From 89764c8403c8e562c5d2ea1f0321f2f636c1bc6f Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:22:25 +0100
Subject: [PATCH 11/60] feat: implement rename table

---
 pyiceberg/catalog/s3tables.py  | 22 ++++++++++++++++++----
 tests/catalog/test_s3tables.py | 19 +++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index f974c354bd..54d57e347d 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -303,14 +303,28 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location:
         return super().register_table(identifier, metadata_location)
 
     def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
-        return super().rename_table(from_identifier, to_identifier)
+        from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier)
+        to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier)
+
+        version_token = self.s3tables.get_table(
+            tableBucketARN=self.table_bucket_arn, namespace=from_namespace, name=from_table_name
+        )["versionToken"]
+
+        self.s3tables.rename_table(
+            tableBucketARN=self.table_bucket_arn,
+            namespace=from_namespace,
+            name=from_table_name,
+            newNamespaceName=to_namespace,
+            newName=to_table_name,
+            versionToken=version_token
+        )
+
+        return self.load_table(to_identifier)
 
     def table_exists(self, identifier: Union[str, Identifier]) -> bool:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
         try:
-            self.s3tables.get_table(
-                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name
-            )
+            self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)
         except self.s3tables.exceptions.NotFoundException:
             return False
         return True
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index e74bddc973..3750b8a357 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -103,6 +103,25 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=identifier)
 
 
+def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    to_database_name = f"{database_name}new"
+    to_table_name = f"{table_name}new"
+    to_identifier = (to_database_name, to_table_name)
+    catalog.create_namespace(namespace=to_database_name)
+    catalog.rename_table(from_identifier=identifier, to_identifier=to_identifier)
+
+    assert not catalog.table_exists(identifier=identifier)
+    assert catalog.table_exists(identifier=to_identifier)
+
+
 def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}

From 8fd19463b48a4077ade1d0f5af4620bd1bafed6b Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:32:03 +0100
Subject: [PATCH 12/60] feat: implement load_namespace_properties

---
 pyiceberg/catalog/s3tables.py  | 11 +++++++++--
 tests/catalog/test_s3tables.py |  9 +++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 54d57e347d..c77651444b 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -271,7 +271,14 @@ def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         return super().list_views(namespace)
 
     def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
-        return super().load_namespace_properties(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace)
+        return {
+            "namespace": response["namespace"],
+            "createdAt": response["createdAt"],
+            "createdBy": response["createdBy"],
+            "ownerAccountId": response["ownerAccountId"],
+        }
 
     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
@@ -316,7 +323,7 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
             name=from_table_name,
             newNamespaceName=to_namespace,
             newName=to_table_name,
-            versionToken=version_token
+            versionToken=version_token,
         )
 
         return self.load_table(to_identifier)
diff --git a/tests/catalog/test_s3tables.py
b/tests/catalog/test_s3tables.py index 3750b8a357..5b7393b3a0 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -70,6 +70,13 @@ def test_create_namespace(table_bucket_arn, database_name: str): assert (database_name,) in namespaces +def test_load_namespace_properties(table_bucket_arn, database_name: str): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + catalog.create_namespace(namespace=database_name) + assert database_name in catalog.load_namespace_properties(database_name)["namespace"] + + def test_drop_namespace(table_bucket_arn, database_name: str): properties = {"warehouse": table_bucket_arn} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) @@ -162,8 +169,6 @@ def test_commit_table(table_bucket_arn, database_name: str, table_name: str, tab original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms - - transaction = table.transaction() update = transaction.update_schema() update.add_column(path="b", field_type=IntegerType()) From a2aae10025cbaae69f1bf80abbd4b1c87b936cc4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:38:07 +0100 Subject: [PATCH 13/60] refactor: move some methods around --- pyiceberg/catalog/s3tables.py | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index c77651444b..2370a5099c 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -10,7 +10,6 @@ from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION from pyiceberg.catalog import WAREHOUSE_LOCATION -from pyiceberg.catalog import Catalog from pyiceberg.catalog import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary from pyiceberg.exceptions import CommitFailedException @@ -157,6 +156,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not pattern.fullmatch(table_name): + # TODO: raise proper errors for invalid table_name ... 
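+            # e.g. "my-table" fails the [a-z0-9][a-z0-9_]{2,62} pattern while
+            # "my_table" passes, which is why the test fixtures replace "-" with "_"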
return namespace, table_name @@ -209,16 +209,6 @@ def create_table( return self.load_table(identifier=identifier) - def create_table_transaction( - self, - identifier: Union[str, Identifier], - schema: Union[Schema, "pa.Schema"], - location: Optional[str] = None, - partition_spec: PartitionSpec = ..., - sort_order: SortOrder = ..., - properties: Properties = ..., - ) -> CreateTableTransaction: - return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def drop_namespace(self, namespace: Union[str, Identifier]) -> None: namespace = self._validate_namespace_identifier(namespace) @@ -244,9 +234,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: versionToken=response["versionToken"], ) - def drop_view(self, identifier: Union[str, Identifier]) -> None: - return super().drop_view(identifier) - def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: # TODO: s3tables only support single level namespaces if namespace: @@ -267,9 +254,6 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: tables.extend((namespace, table["name"]) for table in page["tables"]) return tables - def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_views(namespace) - def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: namespace = self._validate_namespace_identifier(namespace) response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) @@ -303,12 +287,6 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: catalog=self, ) - def purge_table(self, identifier: Union[str, Identifier]) -> None: - return super().purge_table(identifier) - - def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: - return super().register_table(identifier, metadata_location) - def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier) @@ -340,3 +318,26 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... 
) -> PropertiesUpdateSummary: return super().update_namespace_properties(namespace, removals, updates) + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = ..., + sort_order: SortOrder = ..., + properties: Properties = ..., + ) -> CreateTableTransaction: + return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) + + def purge_table(self, identifier: Union[str, Identifier]) -> None: + return super().purge_table(identifier) + + def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: + return super().register_table(identifier, metadata_location) + + def drop_view(self, identifier: Union[str, Identifier]) -> None: + return super().drop_view(identifier) + + def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_views(namespace) From 287d5363ab9383cb7514af1a4d262c7dcd30e3fa Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:39:04 +0100 Subject: [PATCH 14/60] feat: raise NotImplementedError for views functionality --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 2370a5099c..bb0007b796 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -337,7 +337,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: return super().register_table(identifier, metadata_location) def drop_view(self, identifier: Union[str, Identifier]) -> None: - return super().drop_view(identifier) + raise NotImplementedError def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_views(namespace) + raise NotImplementedError From 954d11965a4c5b8ce2ac5b41546ad8db5e4ce43f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:43:11 +0100 Subject: [PATCH 15/60] feat: raise NotImplementedError for purge_table --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index bb0007b796..c91f76eb08 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -331,7 +331,9 @@ def create_table_transaction( return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def purge_table(self, identifier: Union[str, Identifier]) -> None: - return super().purge_table(identifier) + # purge is not supported as s3tables doesn't support delete operations + # table maintenance is automated + raise NotImplementedError def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: return super().register_table(identifier, metadata_location) From f3d2b2fdbf7bc90e66534aca00f9393d5990fb26 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:52:50 +0100 Subject: [PATCH 16/60] feat: raise NotImplementedError for update_namespace_properties --- pyiceberg/catalog/s3tables.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index c91f76eb08..058d2acc59 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -315,9 +315,10 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool: return True def 
update_namespace_properties( - self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... + self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: - return super().update_namespace_properties(namespace, removals, updates) + # namespace properties are read only + raise NotImplementedError def create_table_transaction( self, From 202e98bff70bd54702f4bb3a584468e40616806e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:02:57 +0100 Subject: [PATCH 17/60] feat: raise NotImplementedError for register_table --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 058d2acc59..3c4a1efda7 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -337,7 +337,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: - return super().register_table(identifier, metadata_location) + raise NotImplementedError def drop_view(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError From 1731fcbcdeae64219acf1dc446660a8f4bb284df Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:10:51 +0100 Subject: [PATCH 18/60] fix: don't override create_table_transaction --- pyiceberg/catalog/s3tables.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 3c4a1efda7..555e9783e0 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -320,17 +320,6 @@ def update_namespace_properties( # namespace properties are read only raise NotImplementedError - def create_table_transaction( - self, - identifier: Union[str, Identifier], - schema: Union[Schema, "pa.Schema"], - location: Optional[str] = None, - partition_spec: PartitionSpec = ..., - sort_order: SortOrder = ..., - properties: Properties = ..., - ) -> CreateTableTransaction: - return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) - def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations # table maintenance is automated From 088cf0e05bbff83d40d8ab246e133d6f8600261a Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 15:15:57 +0100 Subject: [PATCH 19/60] chore: run formatter --- pyiceberg/catalog/s3tables.py | 68 ++++++++++------------------------ pyiceberg/exceptions.py | 3 ++ tests/catalog/test_s3tables.py | 7 +--- 3 files changed, 25 insertions(+), 53 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 555e9783e0..b4aa7283b5 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -1,43 +1,24 @@ import re -from typing import TYPE_CHECKING -from typing import List -from typing import Optional -from typing import Set -from typing import Tuple -from typing import Union +from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union import boto3 -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION -from pyiceberg.catalog import WAREHOUSE_LOCATION -from pyiceberg.catalog import MetastoreCatalog -from pyiceberg.catalog import PropertiesUpdateSummary -from pyiceberg.exceptions import 
CommitFailedException -from pyiceberg.exceptions import NamespaceNotEmptyError -from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.exceptions import S3TablesError -from pyiceberg.exceptions import TableBucketNotFound -from pyiceberg.io import AWS_ACCESS_KEY_ID -from pyiceberg.io import AWS_REGION -from pyiceberg.io import AWS_SECRET_ACCESS_KEY -from pyiceberg.io import AWS_SESSION_TOKEN -from pyiceberg.io import PY_IO_IMPL -from pyiceberg.io import load_file_io -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC -from pyiceberg.partitioning import PartitionSpec +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary +from pyiceberg.exceptions import ( + CommitFailedException, + NamespaceNotEmptyError, + NoSuchTableError, + TableBucketNotFound, +) +from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse -from pyiceberg.table import CreateTableTransaction -from pyiceberg.table import Table -from pyiceberg.table import TableRequirement +from pyiceberg.table import CommitTableResponse, Table, TableRequirement from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER -from pyiceberg.table.sorting import SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableUpdate -from pyiceberg.typedef import EMPTY_DICT -from pyiceberg.typedef import Identifier -from pyiceberg.typedef import Properties +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.utils.properties import get_first_property_value if TYPE_CHECKING: @@ -66,9 +47,7 @@ def __init__(self, name: str, **properties: str): region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), - aws_secret_access_key=get_first_property_value( - properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY - ), + aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), ) # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash @@ -91,16 +70,14 @@ def commit_table( # the table can change between those two API calls without noticing # -> change this into an internal API that returns table information along with versionToken current_table = self.load_table(identifier=table_identifier) - version_token = self.s3tables.get_table( - tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name - )["versionToken"] + version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[ + "versionToken" + ] updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing - return CommitTableResponse( - metadata=current_table.metadata, metadata_location=current_table.metadata_location - ) + return 
CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) self._write_metadata( metadata=updated_staged_table.metadata, @@ -175,9 +152,7 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore # TODO: check whether namespace exists and if it does, whether table_name already exists - self.s3tables.create_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" - ) + self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") # location is given by s3 table bucket response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) @@ -209,7 +184,6 @@ def create_table( return self.load_table(identifier=identifier) - def drop_namespace(self, namespace: Union[str, Identifier]) -> None: namespace = self._validate_namespace_identifier(namespace) try: @@ -220,9 +194,7 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: def drop_table(self, identifier: Union[str, Identifier]) -> None: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: - response = self.s3tables.get_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) except self.s3tables.exceptions.NotFoundException as e: raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 6096096f45..f633e4c143 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -111,12 +111,15 @@ class ConditionalCheckFailedException(DynamoDbError): class GenericDynamoDbError(DynamoDbError): pass + class S3TablesError(Exception): pass + class TableBucketNotFound(S3TablesError): pass + class CommitFailedException(Exception): """Commit failed, refresh and try again.""" diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 5b7393b3a0..2cbdca15c3 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,8 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -35,9 +34,7 @@ def table_bucket_arn(): def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name): client = boto3.client("s3tables") client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) - response = client.create_table( - tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG" - ) + response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") version_token = response["versionToken"] scrambled_version_token = version_token[::-1] From 8bc01a24620722ab369a835621f2baf44f0497ce Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:19:27 +0100 Subject: [PATCH 20/60] feat: raise exceptions if boto3 doesn't support s3tables --- pyiceberg/catalog/s3tables.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py 
b/pyiceberg/catalog/s3tables.py index b4aa7283b5..b8009191b2 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -2,6 +2,7 @@
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import boto3
+from boto3.session import UnknownServiceError
 
 from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary
 from pyiceberg.exceptions import (
@@ -9,6 +10,7 @@
     NamespaceNotEmptyError,
     NoSuchTableError,
     TableBucketNotFound,
+    S3TablesError
 )
 from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
@@ -50,9 +52,11 @@ def __init__(self, name: str, **properties: str):
             aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
             aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN),
         )
-        # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash
-        # TODO: set a custom user-agent for api calls like the Java implementation
-        self.s3tables = session.client("s3tables")
+        try:
+            self.s3tables = session.client("s3tables")
+        except UnknownServiceError as e:
+            raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e
+
         # TODO: handle malformed properties instead of just raising a key error here
         self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION]
         try:

From 9b8f0bdcd5ff9da6c1cc17059707560bfc04729e Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:24:55 +0100
Subject: [PATCH 21/60] feat: make endpoint configurable

---
 pyiceberg/catalog/s3tables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b8009191b2..ab3e77a016 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -53,7 +53,7 @@ def __init__(self, name: str, **properties: str):
             aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN),
         )
         try:
-            self.s3tables = session.client("s3tables")
+            self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT))
         except UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e
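Taken together, the two patches above mean the client can be wired up entirely through catalog properties. A minimal sketch of instantiating the catalog at this point in the series (the ARN, region, and endpoint values are placeholders; the next patch moves the bucket ARN from the "warehouse" property to a dedicated "s3tables.table-bucket-arn" property):

    from pyiceberg.catalog.s3tables import S3TableCatalog

    catalog = S3TableCatalog(
        name="s3tables_catalog",
        **{
            "warehouse": "arn:aws:s3tables:us-east-1:111122223333:bucket/example-bucket",  # placeholder ARN
            "s3tables.region": "us-east-1",
            # only needed when targeting a non-default S3 Tables endpoint
            "s3tables.endpoint": "https://s3tables.us-east-1.amazonaws.com",
        },
    )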
From 4973087240779aef140732b71e02b933ea040035 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:34:53 +0100
Subject: [PATCH 22/60] feat: explicitly configure tableBucketARN

---
 pyiceberg/catalog/s3tables.py  |  6 ++--
 tests/catalog/test_s3tables.py | 50 +++++++++++-----------------------
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index ab3e77a016..4675a9cf76 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -32,6 +32,8 @@
 S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key"
 S3TABLES_SESSION_TOKEN = "s3tables.session-token"
 
+S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn"
+
 S3TABLES_ENDPOINT = "s3tables.endpoint"
 
 # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
@@ -44,6 +46,8 @@ def __init__(self, name: str, **properties: str):
         # TODO: implement a proper check for FileIO
         self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT
 
+        self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN]
+
         session = boto3.Session(
             profile_name=properties.get(S3TABLES_PROFILE_NAME),
             region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION),
@@ -57,7 +61,6 @@ def __init__(self, name: str, **properties: str):
         except UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e
 
-        # TODO: handle malformed properties instead of just raising a key error here
-        self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION]
         try:
             self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn)
         except self.s3tables.exceptions.NotFoundException as e:
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 2cbdca15c3..67cdeb21c8 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -30,6 +30,12 @@ def table_bucket_arn():
     return os.environ["ARN"]
 
 
+@pytest.fixture
+def catalog(table_bucket_arn):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    return S3TableCatalog(name="test_s3tables_catalog", **properties)
+
 def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
     client = boto3.client("s3tables")
@@ -54,39 +60,30 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat
 
 
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
-    properties = {"warehouse": f"{table_bucket_arn}-modified"}
+    properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
         S3TableCatalog(name="test_s3tables_catalog", **properties)
 
 
-def test_create_namespace(table_bucket_arn, database_name: str):
-    properties = {"warehouse": table_bucket_arn}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_create_namespace(catalog, database_name: str):
     catalog.create_namespace(namespace=database_name)
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces
 
 
-def test_load_namespace_properties(table_bucket_arn, database_name: str):
-    properties = {"warehouse": table_bucket_arn}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def 
test_load_namespace_properties(catalog, database_name: str): catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(table_bucket_arn, database_name: str): - properties = {"warehouse": table_bucket_arn} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_drop_namespace(catalog, database_name: str): catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -95,10 +92,7 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab assert table == catalog.load_table(identifier) -def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -107,10 +101,7 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=identifier) -def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_rename_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -126,10 +117,7 @@ def test_rename_table(table_bucket_arn, database_name: str, table_name: str, tab assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -138,10 +126,7 @@ def test_list_tables(table_bucket_arn, database_name: str, table_name: str, tabl assert catalog.list_tables(namespace=database_name) -def 
test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -153,10 +138,7 @@ def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table catalog.load_table(identifier=identifier) -def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema): - # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet - properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} - catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) +def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From ce305fef2591a5fdf47c90e0a52b420636906738 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:37:04 +0100 Subject: [PATCH 23/60] fix: remove defaulting to FsspecIO --- pyiceberg/catalog/s3tables.py | 5 ----- tests/catalog/test_s3tables.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4675a9cf76..b8d4ebf672 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -36,15 +36,10 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" -# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 -S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO" - class S3TableCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) - # TODO: implement a proper check for FileIO - self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN] diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 67cdeb21c8..c761ab79ad 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -32,7 +32,7 @@ def table_bucket_arn(): @pytest.fixture def catalog(table_bucket_arn): - # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet + # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} return S3TableCatalog(name="test_s3tables_catalog", **properties) From f5bc5cd664255f182299ba0a8b4ca95e41f14172 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 23 Dec 2024 14:55:12 +0100 Subject: [PATCH 24/60] feat: raise exceptions for invalid namespace/table name --- pyiceberg/catalog/s3tables.py | 31 ++++++++++++------------------- pyiceberg/exceptions.py | 7 +++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index b8d4ebf672..7a3c56ae01 100644 --- 
a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -4,13 +4,15 @@ import boto3 from boto3.session import UnknownServiceError -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( CommitFailedException, NamespaceNotEmptyError, NoSuchTableError, TableBucketNotFound, - S3TablesError + S3TablesError, + InvalidNamespaceName, + InvalidTableName, ) from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -36,6 +38,10 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" +# for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html +S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") +S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" + class S3TableCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): @@ -109,33 +115,20 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str: - # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html - # TODO: extract into constant variables - pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") - reserved = "aws_s3_metadata" - namespace = self.identifier_to_database(namespace) - if not pattern.fullmatch(namespace): - ... - - if namespace == reserved: - ... + if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: + raise InvalidNamespaceName("The specified namespace name is not valid.") return namespace def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]: - # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html - # TODO: extract into constant variables - pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") - namespace, table_name = self.identifier_to_database_and_table(identifier) namespace = self._validate_namespace_identifier(namespace) - if not pattern.fullmatch(table_name): - # TODO: raise proper errors for invalid table_name - ... 
+        if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name):
+            raise InvalidTableName("The specified table name is not valid.")
 
         return namespace, table_name
 
diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index f633e4c143..245fffd3c1 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -115,6 +115,13 @@ class GenericDynamoDbError(DynamoDbError):
 class S3TablesError(Exception):
     pass
 
+class InvalidNamespaceName(S3TablesError):
+    pass
+
+
+class InvalidTableName(S3TablesError):
+    pass
+
 class TableBucketNotFound(S3TablesError):
     pass
 
From 566636299d7d59f45ff280a6c46466a7e4766998 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:48:10 +0100
Subject: [PATCH 25/60] feat: improve error handling for create_table

---
 pyiceberg/catalog/s3tables.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 7a3c56ae01..d9ddeff5dc 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -164,17 +164,20 @@ def create_table(
 
         io = load_file_io(properties=self.properties, location=metadata_location)
         # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now
-        # TODO: we can perform this check manually maybe?
         self._write_metadata(metadata, io, metadata_location, overwrite=True)
-        # TODO: after writing need to update table metadata location
-        # can this operation fail if the version token does not match?
-        self.s3tables.update_table_metadata_location(
-            tableBucketARN=self.table_bucket_arn,
-            namespace=namespace,
-            name=table_name,
-            versionToken=version_token,
-            metadataLocation=metadata_location,
-        )
+
+        try:
+            self.s3tables.update_table_metadata_location(
+                tableBucketARN=self.table_bucket_arn,
+                namespace=namespace,
+                name=table_name,
+                versionToken=version_token,
+                metadataLocation=metadata_location,
+            )
+        except self.s3tables.exceptions.ConflictException as e:
+            raise CommitFailedException(
+                f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}."
+            ) from e
 
         return self.load_table(identifier=identifier)
 
From 38f907f644ff700fcd826334019278c384e01cba Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:49:24 +0100
Subject: [PATCH 26/60] feat: improve error handling for delete_table

---
 pyiceberg/catalog/s3tables.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index d9ddeff5dc..92c8f78077 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -195,13 +195,18 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
         except self.s3tables.exceptions.NotFoundException as e:
             raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e
 
-        # TODO: handle conflicts due to versionToken mismatch that might occur
-        self.s3tables.delete_table(
-            tableBucketARN=self.table_bucket_arn,
-            namespace=namespace,
-            name=table_name,
-            versionToken=response["versionToken"],
-        )
+        version_token = response["versionToken"]
+        try:
+            self.s3tables.delete_table(
+                tableBucketARN=self.table_bucket_arn,
+                namespace=namespace,
+                name=table_name,
+                versionToken=version_token,
+            )
+        except self.s3tables.exceptions.ConflictException as e:
+            raise CommitFailedException(
+                f"Cannot delete {namespace}.{table_name} because of a concurrent update to the table version {version_token}."
+            ) from e
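With these two patches, the versionToken acts as an optimistic lock: every metadata update or delete must present the token that was read, and a mismatched token surfaces as a ConflictException, re-raised as CommitFailedException ("Commit failed, refresh and try again."). A caller that wants to tolerate concurrent writers can therefore refresh and retry; a rough sketch under those assumptions (the helper and its retry policy are illustrative, not part of the patches):

    from pyiceberg.exceptions import CommitFailedException

    def append_with_retry(catalog, identifier, arrow_table, attempts=3):
        # reload the table on each attempt so the commit presents a fresh versionToken
        for attempt in range(attempts):
            table = catalog.load_table(identifier)
            try:
                table.append(arrow_table)
                return
            except CommitFailedException:
                if attempt == attempts - 1:
                    raise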
From db4303dd45e30a5b0d4cc5f962884d9e255539e8 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:50:49 +0100
Subject: [PATCH 27/60] chore: cleanup comments

---
 pyiceberg/catalog/s3tables.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 92c8f78077..b95f2ba8d2 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -209,7 +209,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
             ) from e
 
     def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
-        # TODO: s3tables only support single level namespaces
         if namespace:
             namespace = self._validate_namespace_identifier(namespace)
         paginator = self.s3tables.get_paginator("list_namespaces")
@@ -240,7 +239,7 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper
 
     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
-        # TODO: raise a NoSuchTableError if it does not exist
+
         try:
             response = self.s3tables.get_table_metadata_location(
                 tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name

From 6b8bfd0c4f7157f318e64747a31742cb838f2d27 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:55:16 +0100
Subject: [PATCH 28/60] feat: catch missing metadata for load_table

---
 pyiceberg/catalog/s3tables.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b95f2ba8d2..077d93d7d0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -246,8 +246,10 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
             )
         except self.s3tables.exceptions.NotFoundException as e:
             raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e
-        # TODO: we might need to catch if table is not initialized i.e. 
does not have metadata setup yet - metadata_location = response["metadataLocation"] + + metadata_location = response.get("metadataLocation") + if not metadata_location: + raise S3TablesError("No table metadata found.") io = load_file_io(properties=self.properties, location=metadata_location) file = io.new_input(metadata_location) From a8ef69fbb2ad128c5093bee608d4f6bb72df3172 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:02:17 +0100 Subject: [PATCH 29/60] feat: handle missing namespace and preexisting table --- pyiceberg/catalog/s3tables.py | 15 ++++++++++++--- tests/catalog/test_s3tables.py | 19 ++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 077d93d7d0..6ae2db3f32 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -13,6 +13,8 @@ S3TablesError, InvalidNamespaceName, InvalidTableName, + TableAlreadyExistsError, + NoSuchNamespaceError ) from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -145,10 +147,17 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - # TODO: check whether namespace exists and if it does, whether table_name already exists - self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + try: + self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchNamespaceError( + f"Cannot create {namespace}.{table_name} because no such namespace exists." + ) from e + except self.s3tables.exceptions.ConflictException as e: + raise TableAlreadyExistsError( + f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." 
+ ) from e - # location is given by s3 table bucket response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c761ab79ad..cd764737f2 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,7 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound +from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -59,6 +59,14 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat ) +def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name): + client = boto3.client("s3tables") + client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) + client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") + with pytest.raises(client.exceptions.ConflictException): + client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") + + def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): @@ -92,6 +100,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema assert table == catalog.load_table(identifier) +def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, table_name: str, table_schema_nested: Schema): + identifier = (database_name, table_name) + + with pytest.raises(NoSuchNamespaceError): + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + + + def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) From 8833fcfa754b0a8417fd38b9ddd11f087f6f054e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:12:21 +0100 Subject: [PATCH 30/60] feat: handle versionToken and table in an atomic operation --- pyiceberg/catalog/s3tables.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 6ae2db3f32..e8bbebe960 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -75,13 +75,7 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - # TODO: loading table and getting versionToken should be an atomic operation, otherwise - # the table can change between those two API calls without noticing - # -> change this into an internal API that returns table information along with versionToken - current_table = self.load_table(identifier=table_identifier) - version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[ - "versionToken" - ] + current_table, version_token = self._load_table_and_version(identifier=table_identifier) updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) if current_table and updated_staged_table.metadata == current_table.metadata: @@ -172,7 +166,7 @@ def 
create_table( ) io = load_file_io(properties=self.properties, location=metadata_location) - # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # this triggers unsupported list operation error, setting overwrite=True is a workaround for now self._write_metadata(metadata, io, metadata_location, overwrite=True) try: @@ -247,6 +241,10 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper } def load_table(self, identifier: Union[str, Identifier]) -> Table: + table, _ = self._load_table_and_version(identifier) + return table + + def _load_table_and_version(self, identifier: Union[str, Identifier]) -> Tuple[Table, str]: namespace, table_name = self._validate_database_and_table_identifier(identifier) try: @@ -260,6 +258,8 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: if not metadata_location: raise S3TablesError("No table metadata found.") + version_token = response["versionToken"] + io = load_file_io(properties=self.properties, location=metadata_location) file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) @@ -269,7 +269,7 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: metadata_location=metadata_location, io=self._load_file_io(metadata.properties, metadata_location), catalog=self, - ) + ), version_token def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) From 7491b621c87a349993d488120657a2c47cb5c017 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:13:00 +0100 Subject: [PATCH 31/60] chore: run formatter --- pyiceberg/catalog/s3tables.py | 18 +++++++++--------- pyiceberg/exceptions.py | 1 + tests/catalog/test_s3tables.py | 9 +++++---- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index e8bbebe960..85971d9c4b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -7,16 +7,16 @@ from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( CommitFailedException, + InvalidNamespaceName, + InvalidTableName, NamespaceNotEmptyError, + NoSuchNamespaceError, NoSuchTableError, - TableBucketNotFound, S3TablesError, - InvalidNamespaceName, - InvalidTableName, TableAlreadyExistsError, - NoSuchNamespaceError + TableBucketNotFound, ) -from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io +from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile @@ -142,11 +142,11 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore try: - self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG") + self.s3tables.create_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + ) except self.s3tables.exceptions.NotFoundException as e: - raise NoSuchNamespaceError( - f"Cannot create {namespace}.{table_name} because no such namespace exists." 
- ) from e + raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e except self.s3tables.exceptions.ConflictException as e: raise TableAlreadyExistsError( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 245fffd3c1..8367d91b79 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -115,6 +115,7 @@ class GenericDynamoDbError(DynamoDbError): class S3TablesError(Exception): pass + class InvalidNamespaceName(S3TablesError): pass diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index cd764737f2..d379e73b82 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -4,7 +4,7 @@ import pytest from pyiceberg.catalog.s3tables import S3TableCatalog -from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError +from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -30,6 +30,7 @@ def table_bucket_arn(): return os.environ["ARN"] + @pytest.fixture def catalog(table_bucket_arn): # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 @@ -100,15 +101,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema assert table == catalog.load_table(identifier) -def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_create_table_in_invalid_namespace_raises_exception( + catalog, database_name: str, table_name: str, table_schema_nested: Schema +): identifier = (database_name, table_name) with pytest.raises(NoSuchNamespaceError): catalog.create_table(identifier=identifier, schema=table_schema_nested) - - def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): identifier = (database_name, table_name) From 4640492f10eca06e8411eeffa75df0cd3aeba20d Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 29 Dec 2024 16:17:09 +0100 Subject: [PATCH 32/60] chore: add type hints for tests --- pyiceberg/catalog/s3tables.py | 15 +++++++------- tests/catalog/test_s3tables.py | 36 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 85971d9c4b..fae82b0ee8 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union import boto3 -from boto3.session import UnknownServiceError from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( @@ -20,10 +19,10 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse, Table, TableRequirement +from pyiceberg.table import CommitTableResponse, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder -from pyiceberg.table.update import TableUpdate +from pyiceberg.table.update import TableRequirement, TableUpdate from pyiceberg.typedef 
import EMPTY_DICT, Identifier, Properties
 from pyiceberg.utils.properties import get_first_property_value
 
@@ -61,7 +60,7 @@ def __init__(self, name: str, **properties: str):
         )
         try:
             self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT))
-        except UnknownServiceError as e:
+        except boto3.session.UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e
 
         try:
@@ -106,7 +105,7 @@ def commit_table(
             metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location
         )
 
-    def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None:
+    def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None:
         valid_namespace: str = self._validate_namespace_identifier(namespace)
         self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace])
 
@@ -155,10 +154,10 @@ def create_table(
 
         response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)
         version_token = response["versionToken"]
-        location = response["warehouseLocation"]
-        metadata_location = self._get_metadata_location(location=location)
+        warehouse_location = response["warehouseLocation"]
+        metadata_location = self._get_metadata_location(location=warehouse_location)
         metadata = new_table_metadata(
-            location=location,
+            location=warehouse_location,
             schema=schema,
             partition_spec=partition_spec,
             sort_order=sort_order,
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d379e73b82..101eae485e 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -10,19 +10,19 @@
 
 
 @pytest.fixture
-def database_name(database_name):
+def database_name(database_name: str) -> str:
     # naming rules prevent "-" in namespaces for s3 table buckets
 
 
 @pytest.fixture
-def table_name(table_name):
+def table_name(table_name: str) -> str:
     # naming rules prevent "-" in table names for s3 table buckets
 
 
 @pytest.fixture
-def table_bucket_arn():
+def table_bucket_arn() -> str:
     import os
 
     # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
@@ -32,13 +32,13 @@
 
 
 @pytest.fixture
-def catalog(table_bucket_arn):
+def catalog(table_bucket_arn: str) -> S3TableCatalog:
     # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
     properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     return S3TableCatalog(name="test_s3tables_catalog", **properties)
 
 
-def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
+def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None:
     client = boto3.client("s3tables")
     client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
     response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
     version_token = response["versionToken"]
     scrambled_version_token = version_token[::-1]
 
     warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[
         "warehouseLocation"
     ]
     metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json"
 
     with pytest.raises(client.exceptions.ConflictException):
         client.update_table_metadata_location(
             tableBucketARN=table_bucket_arn,
             namespace=database_name,
             name=table_name,
             versionToken=scrambled_version_token,
             metadataLocation=metadata_location,
         )
 
-def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name):
+def 
test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None: client = boto3.client("s3tables") client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") @@ -68,31 +68,31 @@ def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_nam client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") -def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn): +def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(catalog, database_name: str): +def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert (database_name,) in namespaces -def test_load_namespace_properties(catalog, database_name: str): +def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(catalog, database_name: str): +def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -102,15 +102,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema def test_create_table_in_invalid_namespace_raises_exception( - catalog, database_name: str, table_name: str, table_schema_nested: Schema -): + catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: identifier = (database_name, table_name) with pytest.raises(NoSuchNamespaceError): catalog.create_table(identifier=identifier, schema=table_schema_nested) -def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -119,7 +119,7 @@ def test_table_exists(catalog, database_name: str, table_name: str, table_schema assert catalog.table_exists(identifier=identifier) -def test_rename_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -135,7 +135,7 @@ def test_rename_table(catalog, database_name: str, table_name: str, table_schema assert catalog.table_exists(identifier=to_identifier) 
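# A natural extension of this suite would be a check for the naming rules
# enforced by _validate_namespace_identifier; the test below is a hypothetical
# sketch, not part of the patch (S3TABLES_VALID_NAME_REGEX rejects "-", so
# InvalidNamespaceName from pyiceberg.exceptions is expected):
#
#     def test_create_namespace_with_invalid_name_raises_exception(catalog: S3TableCatalog) -> None:
#         with pytest.raises(InvalidNamespaceName):
#             catalog.create_namespace(namespace="invalid-namespace")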
-def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -144,7 +144,7 @@ def test_list_tables(catalog, database_name: str, table_name: str, table_schema_ assert catalog.list_tables(namespace=database_name) -def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -156,7 +156,7 @@ def test_drop_table(catalog, database_name: str, table_name: str, table_schema_n catalog.load_table(identifier=identifier) -def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema): +def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From 1c7aeb7aa4a7c3d5cbfe50e0778f7ee849b01995 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:29:37 +0100 Subject: [PATCH 33/60] fix: no longer enforce FsspecFileIO --- tests/catalog/test_s3tables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 101eae485e..76663de25a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -33,8 +33,7 @@ def table_bucket_arn() -> str: @pytest.fixture def catalog(table_bucket_arn: str) -> S3TableCatalog: - # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146 - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} + properties = {"s3tables.table-bucket-arn": table_bucket_arn} return S3TableCatalog(name="test_s3tables_catalog", **properties) From 2bafb1a9d324df03da80cd5af21954a8124fac3f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 4 Jan 2025 13:37:01 +0100 Subject: [PATCH 34/60] test: remove tests for boto3 behavior --- tests/catalog/test_s3tables.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 76663de25a..d9f05df430 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -37,36 +37,6 @@ def catalog(table_bucket_arn: str) -> S3TableCatalog: return S3TableCatalog(name="test_s3tables_catalog", **properties) -def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None: - client = boto3.client("s3tables") - client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name]) - response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG") - version_token = response["versionToken"] - scrambled_version_token = version_token[::-1] - - warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[ - "warehouseLocation" - ] - metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json" - - with 
pytest.raises(client.exceptions.ConflictException):
-        client.update_table_metadata_location(
-            tableBucketARN=table_bucket_arn,
-            namespace=database_name,
-            name=table_name,
-            versionToken=scrambled_version_token,
-            metadataLocation=metadata_location,
-        )
-
-
-def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None:
-    client = boto3.client("s3tables")
-    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
-    client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
-    with pytest.raises(client.exceptions.ConflictException):
-        client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
-
-
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None:
     properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
         S3TableCatalog(name="test_s3tables_catalog", **properties)

From 0937f3ef995b999a2d49b072ac4e1cf4e9926a77 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:44:14 +0100
Subject: [PATCH 35/60] test: verify column was created on commit

---
 tests/catalog/test_s3tables.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d9f05df430..15d9ba57ed 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -147,3 +147,4 @@ def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: s
     assert updated_table_metadata.last_updated_ms > last_updated_ms
     assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
     assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
+    assert table.schema().columns[-1].name == "b"

From 848bc07a5884ca88a103a0a369e4647bacc093cd Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:57:59 +0100
Subject: [PATCH 36/60] test: verify new data can be committed to table

---
 tests/catalog/test_s3tables.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 15d9ba57ed..f97371540b 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -1,6 +1,3 @@
-import uuid
-
-import boto3
 import pytest
 
 from pyiceberg.catalog.s3tables import S3TableCatalog
@@ -125,7 +122,9 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str
         catalog.load_table(identifier=identifier)
 
 
-def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
+def test_commit_new_column_to_table(
+    catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
     identifier = (database_name, table_name)
 
     catalog.create_namespace(namespace=database_name)
@@ -148,3 +147,27 @@ def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: s
     assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
     assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
     assert table.schema().columns[-1].name == "b"
+
+
+def test_commit_new_data_to_table(
+    catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    row_count = table.scan().to_arrow().num_rows
+    last_updated_ms = table.metadata.last_updated_ms
+    original_table_metadata_location = table.metadata_location
+    original_table_last_updated_ms = table.metadata.last_updated_ms
+
+    transaction = table.transaction()
+    transaction.append(table.scan().to_arrow())
+    transaction.commit_transaction()
+
+    updated_table_metadata = table.metadata
+    assert updated_table_metadata.last_updated_ms > last_updated_ms
+    assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
+    assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
+    assert table.scan().to_arrow().num_rows == 2 * row_count
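The commit tests above exercise the full metadata handshake that the next patch documents for create_table: create the table through the S3 Tables API, write a metadata.json under the returned warehouseLocation, then register it as the current metadata. A condensed sketch against a bare boto3 client, using only calls that appear in these patches (the ARN, names, and metadata file name are placeholders):

    import boto3

    client = boto3.client("s3tables")
    arn = "arn:aws:s3tables:us-east-1:111122223333:bucket/example-bucket"  # placeholder

    client.create_table(tableBucketARN=arn, namespace="example_namespace", name="example_table", format="ICEBERG")
    response = client.get_table(tableBucketARN=arn, namespace="example_namespace", name="example_table")

    # a metadata.json would be written under response["warehouseLocation"] here, then
    # registered as the table's current metadata together with the read versionToken:
    client.update_table_metadata_location(
        tableBucketARN=arn,
        namespace="example_namespace",
        name="example_table",
        versionToken=response["versionToken"],
        metadataLocation=f"{response['warehouseLocation']}/metadata/00001-example.metadata.json",
    )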
table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + row_count = table.scan().to_arrow().num_rows + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + transaction.append(table.scan().to_arrow()) + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.scan().to_arrow().num_rows == 2 * row_count From b18601a0e0cd031773575d462077a2574db62a12 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:11:33 +0100 Subject: [PATCH 37/60] docs: update documentation for create_table --- pyiceberg/catalog/s3tables.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index fae82b0ee8..0485bb3072 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -140,6 +140,9 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the + # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # created S3 Table. try: self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" @@ -165,7 +168,8 @@ def create_table( ) io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects self._write_metadata(metadata, io, metadata_location, overwrite=True) try: From 69c185634a1e8ea2b66ba63937a6ce80a37fde73 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:25:29 +0100 Subject: [PATCH 38/60] test: set AWS regions explicitly --- tests/catalog/test_s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index f97371540b..930b11ec9a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -30,12 +30,12 @@ def table_bucket_arn() -> str: @pytest.fixture def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn} + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} return S3TableCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} + properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) From 6de777e2d8ed8f39ed777a7ad8d68c6692016308 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:19:43 
+0100 Subject: [PATCH 39/60] Apply suggestions from code review Co-authored-by: Kevin Liu --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0485bb3072..1408322ea7 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -141,7 +141,7 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the - # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: self.s3tables.create_table( diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 930b11ec9a..d9f48e363a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,11 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. return os.environ["ARN"] +@pytest.fixture +def aws_region() -> str: + import os + return os.environ["AWS_REGION"] @pytest.fixture -def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} +def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} return S3TableCatalog(name="test_s3tables_catalog", **properties) From 20f09efcd7448cbba4bb637fbd3d3217f0859fba Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:52:34 +0100 Subject: [PATCH 40/60] test: commit new data to table --- tests/catalog/test_s3tables.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d9f48e363a..d1f6e6ffd1 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,12 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. 
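
Putting the two-step create flow described in the comment above into a standalone sketch, using bare boto3 rather than the catalog class; the ARN and names are placeholders and the metadata serialization itself is elided:

```python
# Step 1: create the table through the S3 Tables API.
import uuid

import boto3

s3tables = boto3.client("s3tables")
arn = "arn:aws:s3tables:us-east-1:012345678901:bucket/example"  # placeholder ARN

s3tables.create_table(tableBucketARN=arn, namespace="example_ns", name="example_table", format="ICEBERG")

# Step 2: write the first metadata.json under the service-assigned
# warehouseLocation, then register it under the current version token.
response = s3tables.get_table(tableBucketARN=arn, namespace="example_ns", name="example_table")
metadata_location = f"{response['warehouseLocation']}/metadata/00001-{uuid.uuid4()}.metadata.json"
# ... serialize the TableMetadata and upload it to metadata_location via FileIO ...
s3tables.update_table_metadata_location(
    tableBucketARN=arn,
    namespace="example_ns",
    name="example_table",
    versionToken=response["versionToken"],
    metadataLocation=metadata_location,
)
```
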
return os.environ["ARN"] + + @pytest.fixture def aws_region() -> str: import os return os.environ["AWS_REGION"] + @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} @@ -153,15 +156,62 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_commit_new_data_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema -) -> None: +def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows + + +def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) - table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) row_count = table.scan().to_arrow().num_rows + assert row_count last_updated_ms = table.metadata.last_updated_ms original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms @@ -172,6 +222,6 @@ def test_commit_new_data_to_table( updated_table_metadata = table.metadata assert updated_table_metadata.last_updated_ms > last_updated_ms - assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location - assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count From 589df88b557ee11fe3e0a85afaba8a0dd5862060 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:56:05 +0100 Subject: [PATCH 41/60] feat: clarify update_namespace_properties error --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 
1408322ea7..01c42f9e3b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -305,7 +305,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError + raise NotImplementedError("Namespace properties are read only") def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From 54552028b80c0d9f44ee1b8c5c36b92764f08bcc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:59:55 +0100 Subject: [PATCH 42/60] feat: raise error when setting custom namespace properties --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 01c42f9e3b..d5f5bbc305 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -106,6 +106,8 @@ def commit_table( ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: + if properties: + raise NotImplementedError("Setting namespace properties is not supported.") valid_namespace: str = self._validate_namespace_identifier(namespace) self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) @@ -305,7 +307,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError("Namespace properties are read only") + raise NotImplementedError("Namespace properties are read only.") def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From 121b19f153183eedd0b26ab070b8945f307b87f1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:57:31 +0100 Subject: [PATCH 43/60] refactor: change S3TableCatalog -> S3TablesCatalog --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index d5f5bbc305..ed6a8027f2 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -44,7 +44,7 @@ S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" -class S3TableCatalog(MetastoreCatalog): +class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d1f6e6ffd1..c5cf959d4a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -1,6 +1,6 @@ import pytest -from pyiceberg.catalog.s3tables import S3TableCatalog +from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -36,36 +36,36 @@ def aws_region() -> str: @pytest.fixture -def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: +def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} - return 
S3TableCatalog(name="test_s3tables_catalog", **properties) + return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): - S3TableCatalog(name="test_s3tables_catalog", **properties) + S3TablesCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_create_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert (database_name,) in namespaces -def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None: +def test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_create_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -75,7 +75,7 @@ def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: s def test_create_table_in_invalid_namespace_raises_exception( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -83,7 +83,7 @@ def test_create_table_in_invalid_namespace_raises_exception( catalog.create_table(identifier=identifier, schema=table_schema_nested) -def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_table_exists(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -92,7 +92,7 @@ def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=identifier) -def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_rename_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -108,7 +108,7 @@ def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: str, 
table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -117,7 +117,7 @@ def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: st assert catalog.list_tables(namespace=database_name) -def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -130,7 +130,7 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str def test_commit_new_column_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -156,7 +156,7 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_write_pyarrow_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -184,7 +184,7 @@ def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_ assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows -def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From eca21866c2f57a3aa26360f7e86234c075165bad Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:59:42 +0100 Subject: [PATCH 44/60] feat: raise error on specified table location --- pyiceberg/catalog/s3tables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index ed6a8027f2..a732afcdd9 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -138,6 +138,8 @@ def create_table( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> Table: + if location: + raise NotImplementedError("S3 Tables does not support user specified table locations.") namespace, table_name = self._validate_database_and_table_identifier(identifier) schema: Schema = self._convert_schema_if_needed(schema) # type: ignore From 42245bfd14c9873faec9bd074e781233374c7040 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 17:57:00 +0100 Subject: [PATCH 45/60] feat: return empty list when querying a hierarchical namespace --- pyiceberg/catalog/s3tables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a732afcdd9..0237dcd0ae 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -220,7 +220,8 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: if namespace: - namespace = self._validate_namespace_identifier(namespace) + # hierarchical namespaces are not supported + return [] paginator = self.s3tables.get_paginator("list_namespaces") 
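
The paginator picked up here wraps the continuation-token loop of `list_namespaces`. The same pattern in isolation, assuming the S3 Tables API response shape (a `namespaces` list whose entries carry the name parts under `namespace`); the ARN is again a placeholder:

```python
import boto3

s3tables = boto3.client("s3tables")
arn = "arn:aws:s3tables:us-east-1:012345678901:bucket/example"  # placeholder ARN

namespaces = []
for page in s3tables.get_paginator("list_namespaces").paginate(tableBucketARN=arn):
    for entry in page["namespaces"]:
        namespaces.append(tuple(entry["namespace"]))  # e.g. ("example_ns",)
```
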
namespaces: List[Identifier] = [] From 5977c234c3792e3d24e1295d3f402da7770420e1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:00:22 +0100 Subject: [PATCH 46/60] refactor: use get_table_metadata_location instead of get_table --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0237dcd0ae..6384823705 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -158,7 +158,7 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." ) from e - response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] warehouse_location = response["warehouseLocation"] From 2dbac34d6a0969d2b2272c7d165276f86fc97bae Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:02:57 +0100 Subject: [PATCH 47/60] refactor: extract 'ICEBERG' table format into constant --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 6384823705..efc22a42c3 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -43,6 +43,8 @@ S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" +S3TABLES_FORMAT = "ICEBERG" + class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): @@ -149,7 +151,7 @@ def create_table( # created S3 Table. 
try: self.s3tables.create_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT ) except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e From ba76c1576dd2b7f723e6ba09597f80b0e8c9068f Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:05:44 +0100 Subject: [PATCH 48/60] feat: change s3tables.table-bucket-arn -> s3tables.warehouse --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index efc22a42c3..4d22807687 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" S3TABLES_SESSION_TOKEN = "s3tables.session-token" -S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn" +S3TABLES_TABLE_BUCKET_ARN = "s3tables.warehouse" S3TABLES_ENDPOINT = "s3tables.endpoint" diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c5cf959d4a..dc3bb88238 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -37,12 +37,12 @@ def aws_region() -> str: @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} + properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region} return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} + properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TablesCatalog(name="test_s3tables_catalog", **properties) From 2ff29a5569a9d81bcde28dccabcd32dd80d73e3a Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:09:03 +0100 Subject: [PATCH 49/60] Apply suggestions from code review Co-authored-by: Honah J. 
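
With the property rename in PATCH 48 above, constructing the catalog directly looks roughly like this; ARN and region are placeholders:

```python
from pyiceberg.catalog.s3tables import S3TablesCatalog

properties = {
    "s3tables.warehouse": "arn:aws:s3tables:us-east-1:012345678901:bucket/example",  # placeholder
    "s3tables.region": "us-east-1",
}
catalog = S3TablesCatalog(name="example_catalog", **properties)
```
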
--- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d22807687..1854dbfc9b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -40,7 +40,7 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html -S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") +S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{1,61}[a-z0-9]") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" S3TABLES_FORMAT = "ICEBERG" From 1afaa8c6d5adacb49f4ec1a901638a9847b247ab Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:13:19 +0100 Subject: [PATCH 50/60] feat: add link to naming-rules for invalid name errors --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1854dbfc9b..4d89108d8d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,7 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid.") + raise InvalidNamespaceName("The specified namespace name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace @@ -127,7 +127,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid.") + raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace, table_name From 94ce254ee936ddbbe00da28b5b35b05be4b008a6 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:14 +0100 Subject: [PATCH 51/60] feat: delete s3 table if writing new_table_metadata is unsuccessful --- pyiceberg/catalog/s3tables.py | 63 ++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d89108d8d..2cfdf9595b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -150,9 +150,9 @@ def create_table( # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: - self.s3tables.create_table( + version_token = self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT - ) + )["versionToken"] except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e except self.s3tables.exceptions.ConflictException as e: @@ -160,36 +160,39 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." 
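
A quick sanity check of the naming regex tightened in PATCH 49: three to 63 characters, lowercase alphanumerics plus underscore, starting and ending with a letter or digit.

```python
import re

# Regex exactly as introduced in the patch above.
S3TABLES_VALID_NAME_REGEX = re.compile("[a-z0-9][a-z0-9_]{1,61}[a-z0-9]")

assert S3TABLES_VALID_NAME_REGEX.fullmatch("my_table_1")     # valid
assert not S3TABLES_VALID_NAME_REGEX.fullmatch("my-table")   # '-' is not allowed
assert not S3TABLES_VALID_NAME_REGEX.fullmatch("_my_table")  # must not start with '_'
assert not S3TABLES_VALID_NAME_REGEX.fullmatch("ab")         # shorter than three characters
```
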
) from e - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) - version_token = response["versionToken"] - - warehouse_location = response["warehouseLocation"] - metadata_location = self._get_metadata_location(location=warehouse_location) - metadata = new_table_metadata( - location=warehouse_location, - schema=schema, - partition_spec=partition_spec, - sort_order=sort_order, - properties=properties, - ) - - io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, - # setting overwrite=True is a workaround for now since it prevents a call to list_objects - self._write_metadata(metadata, io, metadata_location, overwrite=True) - try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=version_token, - metadataLocation=metadata_location, + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + warehouse_location = response["warehouseLocation"] + + metadata_location = self._get_metadata_location(location=warehouse_location) + metadata = new_table_metadata( + location=warehouse_location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." - ) from e + + io = load_file_io(properties=self.properties, location=metadata_location) + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects + self._write_metadata(metadata, io, metadata_location, overwrite=True) + + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + except: + self.s3tables.delete_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + raise return self.load_table(identifier=identifier) From 61d0e05ff4ce26d40be0f97780d73de81041e0fe Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:38 +0100 Subject: [PATCH 52/60] chore: run linter --- pyiceberg/catalog/s3tables.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 2cfdf9595b..1e6b8585d6 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,9 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid. 
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidNamespaceName( + "The specified namespace name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace @@ -127,7 +129,9 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidTableName( + "The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace, table_name @@ -161,7 +165,9 @@ def create_table( ) from e try: - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) warehouse_location = response["warehouseLocation"] metadata_location = self._get_metadata_location(location=warehouse_location) From 296b14ef777248ad7fcf7e57387b6a5244736e89 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 19:15:06 +0100 Subject: [PATCH 53/60] test: rename test_s3tables.py -> integration_test_s3tables.py --- ..._s3tables.py => integration_test_s3tables.py} | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) rename tests/catalog/{test_s3tables.py => integration_test_s3tables.py} (93%) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/integration_test_s3tables.py similarity index 93% rename from tests/catalog/test_s3tables.py rename to tests/catalog/integration_test_s3tables.py index dc3bb88238..7b880bee31 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -20,19 +20,25 @@ def table_name(table_name: str) -> str: @pytest.fixture def table_bucket_arn() -> str: + """Set the environment variable AWS_TEST_S3_TABLE_BUCKET_ARN for an S3 table bucket to test.""" import os - # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint - # in one of the supported regions. 
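
Stepping back to the rollback added in PATCH 51: the shape is a small compensation block. If anything after `create_table` fails, the orphaned S3 Table is deleted before the error propagates. In outline, with hypothetical stand-ins for the two metadata steps:

```python
import boto3

s3tables = boto3.client("s3tables")
arn = "arn:aws:s3tables:us-east-1:012345678901:bucket/example"  # placeholder ARN


def write_initial_metadata() -> None:
    ...  # hypothetical stand-in: render and upload the first metadata.json


def register_metadata_location() -> None:
    ...  # hypothetical stand-in: update_table_metadata_location(...)


s3tables.create_table(tableBucketARN=arn, namespace="example_ns", name="example_table", format="ICEBERG")
try:
    write_initial_metadata()
    register_metadata_location()
except:  # noqa: E722 - mirrors the bare except in the patch
    # Roll back the physical table so a failed create leaves nothing behind.
    s3tables.delete_table(tableBucketARN=arn, namespace="example_ns", name="example_table")
    raise
```
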
- - return os.environ["ARN"] + table_bucket_arn = os.getenv("AWS_TEST_S3_TABLE_BUCKET_ARN") + if not table_bucket_arn: + raise ValueError( + "Please specify a table bucket arn to run the test by setting environment variable AWS_TEST_S3_TABLE_BUCKET_ARN" + ) + return table_bucket_arn @pytest.fixture def aws_region() -> str: import os - return os.environ["AWS_REGION"] + aws_region = os.getenv("AWS_REGION") + if not aws_region: + raise ValueError("Please specify an AWS region to run the test by setting environment variable AWS_REGION") + return aws_region @pytest.fixture From eb71a1f7badf3e4cb16d290243b0d6556630a432 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 8 Jan 2025 20:19:53 +0100 Subject: [PATCH 54/60] fix: add license to files --- pyiceberg/catalog/s3tables.py | 16 ++++++++++++++++ tests/catalog/integration_test_s3tables.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1e6b8585d6..a577613c92 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import re from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 7b880bee31..351f530122 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
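
The commit path reworked in the next patch surfaces concurrent writers as `CommitFailedException` once the version-token check fails. A hedged caller-side retry, assuming a `catalog` and `identifier` as set up in the surrounding tests, might look like:

```python
from pyiceberg.exceptions import CommitFailedException
from pyiceberg.types import IntegerType

for _ in range(3):  # bounded retry; the policy is up to the caller
    table = catalog.load_table(identifier)  # reload to pick up the latest state
    try:
        with table.update_schema() as update:
            update.add_column(path="b", field_type=IntegerType())
        break
    except CommitFailedException:
        continue  # another writer won the race; retry on fresh metadata
```
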
import pytest from pyiceberg.catalog.s3tables import S3TablesCatalog From 6eb24c96f030bf3274a544784d04710b978fdfb9 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Thu, 9 Jan 2025 08:56:43 +0100 Subject: [PATCH 55/60] fix: raise error when creating a table during a transaction --- pyiceberg/catalog/s3tables.py | 79 ++++++++++++++-------- tests/catalog/integration_test_s3tables.py | 36 ++++++++++ 2 files changed, 86 insertions(+), 29 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a577613c92..14c249448e 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse, Table +from pyiceberg.table import CommitTableResponse, CreateTableTransaction, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableRequirement, TableUpdate @@ -92,36 +92,57 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - current_table, version_token = self._load_table_and_version(identifier=table_identifier) - - updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) - if current_table and updated_staged_table.metadata == current_table.metadata: - # no changes, do nothing - return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) - - self._write_metadata( - metadata=updated_staged_table.metadata, - io=updated_staged_table.io, - metadata_path=updated_staged_table.metadata_location, - overwrite=True, - ) - - # try to update metadata location which will fail if the versionToken changed meanwhile + current_table: Optional[Table] + version_token: Optional[str] try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=database_name, - name=table_name, - versionToken=version_token, - metadataLocation=updated_staged_table.metadata_location, + current_table, version_token = self._load_table_and_version(identifier=table_identifier) + except NoSuchTableError: + current_table = None + version_token = None + + if current_table: + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." 
- ) from e - return CommitTableResponse( - metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location - ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) + else: + # table does not exist, create it + raise NotImplementedError("Creating a table on commit is currently not supported.") + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> CreateTableTransaction: + raise NotImplementedError("create_table_transaction currently not supported.") def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: if properties: diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 351f530122..e803b5a2b9 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -18,7 +18,9 @@ from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema +from pyiceberg.transforms import IdentityTransform from pyiceberg.types import IntegerType @@ -247,3 +249,37 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count + + +def test_create_table_transaction( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str +) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + with catalog.create_table_transaction( + identifier, + table_schema_nested, + partition_spec=PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="foo")), + ) as txn: + last_updated_metadata = txn.table_metadata.last_updated_ms + with txn.update_schema() as update_schema: + update_schema.add_column(path="b", field_type=IntegerType()) + + with txn.update_spec() as update_spec: + update_spec.add_identity("bar") + + txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c") + + table = catalog.load_table(identifier) + + assert table.schema().find_field("b").field_type == IntegerType() + assert table.properties == {"test_a": "test_aa", "test_b": "test_b", "test_c": "test_c"} + assert table.spec().last_assigned_field_id == 1001 + assert table.spec().fields_by_source_id(1)[0].name == "foo" + assert 
table.spec().fields_by_source_id(1)[0].field_id == 1000
+    assert table.spec().fields_by_source_id(1)[0].transform == IdentityTransform()
+    assert table.spec().fields_by_source_id(2)[0].name == "bar"
+    assert table.spec().fields_by_source_id(2)[0].field_id == 1001
+    assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform()
+    assert table.metadata.last_updated_ms > last_updated_metadata

From 54b8e877e552b4d673d770525c16bbe09da08fa8 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Thu, 9 Jan 2025 09:01:26 +0100
Subject: [PATCH 56/60] test: mark create_table_transaction test with xfail

---
 tests/catalog/integration_test_s3tables.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py
index e803b5a2b9..8cfcbe7b44 100644
--- a/tests/catalog/integration_test_s3tables.py
+++ b/tests/catalog/integration_test_s3tables.py
@@ -251,6 +251,7 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str,
     assert table.scan().to_arrow().num_rows == 2 * row_count


+@pytest.mark.xfail(raises=NotImplementedError, reason="create_table_transaction not implemented yet")
 def test_create_table_transaction(
     catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str
 ) -> None:

From e9e2cf755d8e038d01db3f41a5c1024868a65631 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 1 Feb 2025 19:59:02 +0100
Subject: [PATCH 57/60] feat: raise NotImplementedError for view_exists

---
 pyiceberg/catalog/s3tables.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 14c249448e..16da5346e0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -373,3 +373,6 @@ def drop_view(self, identifier: Union[str, Identifier]) -> None:

     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         raise NotImplementedError
+
+    def view_exists(self, identifier: Union[str, Identifier]) -> bool:
+        raise NotImplementedError

From cf020b30b068b5d6be468b797d69e74e9c83de30 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 2 Feb 2025 12:21:57 +0100
Subject: [PATCH 58/60] test: use moto server for s3tables tests

---
 pyiceberg/catalog/__init__.py  |  11 ++
 tests/catalog/test_s3tables.py | 307 +++++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 tests/catalog/test_s3tables.py

diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py
index c54ccfc47d..c79f4d00fd 100644
--- a/pyiceberg/catalog/__init__.py
+++ b/pyiceberg/catalog/__init__.py
@@ -116,6 +116,7 @@ class CatalogType(Enum):
     GLUE = "glue"
     DYNAMODB = "dynamodb"
     SQL = "sql"
+    S3TABLES = "s3tables"


 def load_rest(name: str, conf: Properties) -> Catalog:
@@ -162,12 +163,22 @@ def load_sql(name: str, conf: Properties) -> Catalog:
         ) from exc


+def load_s3tables(name: str, conf: Properties) -> Catalog:
+    try:
+        from pyiceberg.catalog.s3tables import S3TablesCatalog
+
+        return S3TablesCatalog(name, **conf)
+    except ImportError as exc:
+        raise NotInstalledError("AWS S3Tables support not installed: pip install 'pyiceberg[s3tables]'") from exc
+
+
 AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
     CatalogType.REST: load_rest,
     CatalogType.HIVE: load_hive,
     CatalogType.GLUE: load_glue,
     CatalogType.DYNAMODB: load_dynamodb,
     CatalogType.SQL: load_sql,
+    CatalogType.S3TABLES: load_s3tables,
 }


diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
new file mode 100644
index 0000000000..8ae0ff1da0
--- /dev/null
+++ b/tests/catalog/test_s3tables.py
@@ -0,0 +1,307 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import string
+from random import choice
+
+import boto3
+import pytest
+
+from pyiceberg.catalog import load_catalog
+from pyiceberg.catalog.s3tables import S3TablesCatalog
+from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound
+from pyiceberg.partitioning import PartitionField, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.transforms import IdentityTransform
+from pyiceberg.types import IntegerType
+
+
+@pytest.fixture
+def database_name(database_name: str) -> str:
+    # naming rules prevent "-" in namespaces for s3 table buckets
+    return database_name.replace("-", "_")
+
+
+@pytest.fixture
+def table_name(table_name: str) -> str:
+    # naming rules prevent "-" in table names for s3 table buckets
+    return table_name.replace("-", "_")
+
+
+@pytest.fixture
+def aws_region() -> str:
+    return "us-east-1"
+
+
+@pytest.fixture
+def table_bucket_arn(monkeypatch: pytest.MonkeyPatch, moto_endpoint_url: str) -> str:
+    monkeypatch.setenv("AWS_ENDPOINT_URL", moto_endpoint_url)
+
+    prefix = "pyiceberg-table-bucket-"
+    random_tag = "".join(choice(string.ascii_letters) for _ in range(12))
+    name = (prefix + random_tag).lower()
+    table_bucket_arn = boto3.client("s3tables", endpoint_url=moto_endpoint_url).create_table_bucket(name=name)["arn"]
+    return table_bucket_arn
+
+
+@pytest.fixture(params=["pyiceberg.io.fsspec.FsspecFileIO", "pyiceberg.io.pyarrow.PyArrowFileIO"])
+def file_io_impl(request: pytest.FixtureRequest) -> str:
+    return request.param
+
+
+@pytest.fixture
+def catalog(table_bucket_arn: str, aws_region: str, moto_endpoint_url: str, file_io_impl: str) -> S3TablesCatalog:
+    properties = {
+        "s3tables.warehouse": table_bucket_arn,
+        "s3tables.region": aws_region,
+        "py-io-impl": file_io_impl,
+        "s3tables.endpoint": moto_endpoint_url,
+        "s3.endpoint": moto_endpoint_url,
+    }
+    return S3TablesCatalog(name="test_s3tables_catalog", **properties)
+
+
+def test_load_catalog(table_bucket_arn: str, aws_region: str, moto_endpoint_url: str) -> None:
+    properties = {
+        "type": "s3tables",
+        "s3tables.warehouse": table_bucket_arn,
+        "s3tables.region": aws_region,
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3tables.endpoint": moto_endpoint_url,
+    }
+    catalog = load_catalog(**properties)
+    assert isinstance(catalog, S3TablesCatalog)
+
+
+def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None:
+    properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"}
+    with pytest.raises(TableBucketNotFound):
+        S3TablesCatalog(name="test_s3tables_catalog", **properties)
+
+
+def 
test_create_namespace(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + namespaces = catalog.list_namespaces() + assert (database_name,) in namespaces + + +def test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + assert database_name in catalog.load_namespace_properties(database_name)["namespace"] + + +def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + assert (database_name,) in catalog.list_namespaces() + catalog.drop_namespace(namespace=database_name) + assert (database_name,) not in catalog.list_namespaces() + + +def test_create_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + assert table == catalog.load_table(identifier) + + +def test_create_table_in_invalid_namespace_raises_exception( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, table_name) + + with pytest.raises(NoSuchNamespaceError): + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + +def test_table_exists(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.table_exists(identifier=identifier) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.table_exists(identifier=identifier) + + +def test_rename_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + to_database_name = f"{database_name}new" + to_table_name = f"{table_name}new" + to_identifier = (to_database_name, to_table_name) + catalog.create_namespace(namespace=to_database_name) + catalog.rename_table(from_identifier=identifier, to_identifier=to_identifier) + + assert not catalog.table_exists(identifier=identifier) + assert catalog.table_exists(identifier=to_identifier) + + +def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.list_tables(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.list_tables(namespace=database_name) + + +def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + catalog.drop_table(identifier=identifier) + + with pytest.raises(NoSuchTableError): + catalog.load_table(identifier=identifier) + + +def test_commit_new_column_to_table( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, 
table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.current_schema_id == 1 + assert len(updated_table_metadata.schemas) == 2 + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.schema().columns[-1].name == "b" + + +def test_write_pyarrow_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows + + +def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + row_count = table.scan().to_arrow().num_rows + assert row_count + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + transaction.append(table.scan().to_arrow()) + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms + assert table.scan().to_arrow().num_rows == 2 * row_count + + +@pytest.mark.xfail(raises=NotImplementedError) +def test_create_table_transaction( + catalog: S3TablesCatalog, 
database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
+    identifier = (database_name, table_name)
+    catalog.create_namespace(namespace=database_name)
+
+    with catalog.create_table_transaction(
+        identifier,
+        table_schema_nested,
+        partition_spec=PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="foo")),
+    ) as txn:
+        last_updated_metadata = txn.table_metadata.last_updated_ms
+        with txn.update_schema() as update_schema:
+            update_schema.add_column(path="b", field_type=IntegerType())
+
+        with txn.update_spec() as update_spec:
+            update_spec.add_identity("bar")
+
+        txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c")
+
+    table = catalog.load_table(identifier)
+
+    assert table.schema().find_field("b").field_type == IntegerType()
+    assert table.properties == {"test_a": "test_aa", "test_b": "test_b", "test_c": "test_c"}
+    assert table.spec().last_assigned_field_id == 1001
+    assert table.spec().fields_by_source_id(1)[0].name == "foo"
+    assert table.spec().fields_by_source_id(1)[0].field_id == 1000
+    assert table.spec().fields_by_source_id(1)[0].transform == IdentityTransform()
+    assert table.spec().fields_by_source_id(2)[0].name == "bar"
+    assert table.spec().fields_by_source_id(2)[0].field_id == 1001
+    assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform()
+    assert table.metadata.last_updated_ms > last_updated_metadata

From 5b0e622087d0fe40e5c216bd2c35683ec4bbbcee Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 2 Feb 2025 13:54:49 +0100
Subject: [PATCH 59/60] docs: add s3tables catalog

---
 mkdocs/docs/configuration.md | 98 ++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 3705be5d35..912582b8c1 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -524,6 +524,104 @@ catalog:

+### S3Tables Catalog
+
+The S3Tables Catalog builds on the catalog functionality of the Amazon S3 Tables service and requires an existing S3 table bucket to operate.
+
+To use Amazon S3 Tables as your catalog, configure PyIceberg using one of the following methods. Additionally, refer to the [AWS documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) on configuring credentials to set up your AWS account credentials locally.
+
+If you intend to use the same credentials for both the S3Tables Catalog and S3 FileIO, you can configure the [`client.*` properties](configuration.md#unified-aws-credentials) to streamline the process.
+
+Note that the S3Tables Catalog manages the underlying table locations itself, which makes it incompatible with S3-compatible storage systems such as MinIO. If you specify a custom `s3tables.endpoint`, ensure that `s3.endpoint` is configured accordingly.
+
+```yaml
+catalog:
+  default:
+    type: s3tables
+    warehouse: arn:aws:s3tables:us-east-1:012345678901:bucket/pyiceberg-catalog
+```
+
+If you prefer to pass the credentials explicitly to the client instead of relying on environment variables:
+
+```yaml
+catalog:
+  default:
+    type: s3tables
+    s3tables.access-key-id: <ACCESS_KEY_ID>
+    s3tables.secret-access-key: <SECRET_ACCESS_KEY>
+    s3tables.session-token: <SESSION_TOKEN>
+    s3tables.region: <REGION_NAME>
+    s3tables.endpoint: http://localhost:9000
+    s3.endpoint: http://localhost:9000
+```
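+
+The same configuration can also be supplied programmatically. Below is a minimal sketch, assuming the `s3tables` catalog type shown in the YAML above is registered with `load_catalog`; the ARN and region are placeholder values:
+
+```python
+from pyiceberg.catalog import load_catalog
+
+# equivalent to the first YAML configuration above (placeholder ARN and region)
+catalog = load_catalog(
+    "default",
+    **{
+        "type": "s3tables",
+        "warehouse": "arn:aws:s3tables:us-east-1:012345678901:bucket/pyiceberg-catalog",
+        "s3tables.region": "us-east-1",
+    },
+)
+```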
+
+!!! Note "Client-specific Properties"
+    `s3tables.*` properties apply to the S3TablesCatalog only. If you want to use the same credentials for both the S3TablesCatalog and S3 FileIO, you can set the `client.*` properties. See the [Unified AWS Credentials](configuration.md#unified-aws-credentials) section for more details.
+
+| Key                        | Example             | Description                                                                 |
+| -------------------------- | ------------------- | --------------------------------------------------------------------------- |
+| s3tables.profile-name      | default             | Configure the static profile used to access the S3Tables Catalog           |
+| s3tables.region            | us-east-1           | Set the region of the S3Tables Catalog                                     |
+| s3tables.access-key-id     | admin               | Configure the static access key id used to access the S3Tables Catalog     |
+| s3tables.secret-access-key | password            | Configure the static secret access key used to access the S3Tables Catalog |
+| s3tables.session-token     | AQoDYXdzEJr...      | Configure the static session token used to access the S3Tables Catalog     |
+| s3tables.endpoint          | ...                 | Configure a custom endpoint for the S3 Tables API                           |
+| s3tables.warehouse         | arn:aws:s3tables... | Set the ARN of the underlying S3 table bucket                               |
+
+!!! warning "Removed Properties"
+    The properties `profile_name`, `region_name`, `aws_access_key_id`, `aws_secret_access_key`, and `aws_session_token` were deprecated and removed in 0.8.0.
+
+An example usage of the S3Tables Catalog is shown below:
+
+```python
+from pyiceberg.catalog.s3tables import S3TablesCatalog
+import pyarrow as pa
+
+
+table_bucket_arn: str = "..."
+aws_region: str = "..."
+
+properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region}
+catalog = S3TablesCatalog(name="s3tables_catalog", **properties)
+
+database_name = "prod"
+
+catalog.create_namespace(namespace=database_name)
+
+pyarrow_table = pa.Table.from_arrays(
+    [
+        pa.array([None, "A", "B", "C"]),  # 'foo' column
+        pa.array([1, 2, 3, 4]),  # 'bar' column
+        pa.array([True, None, False, True]),  # 'baz' column
+        pa.array([None, "A", "B", "C"]),  # 'large' column
+    ],
+    schema=pa.schema(
+        [
+            pa.field("foo", pa.large_string(), nullable=True),
+            pa.field("bar", pa.int32(), nullable=False),
+            pa.field("baz", pa.bool_(), nullable=True),
+            pa.field("large", pa.large_string(), nullable=True),
+        ]
+    ),
+)
+
+identifier = (database_name, "orders")
+table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema)
+table.append(pyarrow_table)
+```
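+
+Once written, the table can be loaded back through the catalog in a later session and read with PyArrow. A short follow-up sketch, reusing `identifier` from the example above:
+
+```python
+# load the existing table and read it back as a PyArrow table
+table = catalog.load_table(identifier)
+assert table.scan().to_arrow().num_rows == 4
+```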
"sha256:4d3437693411ec943c13c77de5b0b520c4b0a9ac850fead4ba2a54709e086e8b"}, ] [package.dependencies] @@ -5359,4 +5359,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "95b2b91331fa7da405234dfb437881af27a4f0eed987628397c8b0428707636f" +content-hash = "a121d29b25a7e835b1fb226ac804861bf308070b4bf4eab221cd23405059d9fc" diff --git a/pyproject.toml b/pyproject.toml index dfa6f286c7..6e940e0058 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,7 +89,7 @@ pre-commit = "4.1.0" fastavro = "1.10.0" coverage = { version = "^7.4.2", extras = ["toml"] } requests-mock = "1.12.1" -moto = { version = "^5.0.2", extras = ["server"] } +moto = { version = "^5.0.28", extras = ["server"] } typing-extensions = "4.12.2" pytest-mock = "3.14.0" pyspark = "3.5.3"