[WIP] enables python 3.13 #2047

Draft · wants to merge 9 commits into base: devel
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -59,7 +59,7 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --all-extras --with airflow,providers,pipeline,sentry-sdk,dbt
+ run: poetry install --all-extras --with airflow,providers,pipeline,dbt,sentry-sdk

- name: Run make lint
run: |
32 changes: 18 additions & 14 deletions .github/workflows/test_common.yml
@@ -45,6 +45,10 @@ jobs:
os: "ubuntu-latest"
- python-version: "3.12.x"
os: "ubuntu-latest"
- python-version: "3.13.x"
os: "ubuntu-latest"
- python-version: "3.12.x"
os: "windows-latest"

defaults:
run:
@@ -80,7 +84,7 @@ jobs:
virtualenvs-in-project: true
installer-parallel: true

- # NOTE: do not cache. we want to have a clean state each run and we upgrade depdendencies later
+ # NOTE: do not cache. we want to have a clean state each run and we upgrade dependencies later
# - name: Load cached venv
# id: cached-poetry-dependencies
# uses: actions/cache@v3
@@ -116,7 +120,7 @@ jobs:
shell: cmd

- name: Install pyarrow
- run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2
+ run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk # && poetry run pip install pyarrow==15.0.2

- run: |
poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow
@@ -129,7 +133,7 @@
shell: cmd

- name: Install pipeline and sources dependencies
- run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2
+ run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources # && poetry run pip install pyarrow==15.0.2

- run: |
poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common tests/destinations tests/sources
@@ -156,18 +160,18 @@
shell: cmd

# here we upgrade pyarrow to 17 and run the libs tests again
- - name: Install pyarrow 17
- run: poetry run pip install pyarrow==17.0.0
+ # - name: Install pyarrow 17
+ # run: poetry run pip install pyarrow==17.0.0

- - run: |
- poetry run pytest tests/libs
- if: runner.os != 'Windows'
- name: Run libs tests Linux/MAC
- - run: |
- poetry run pytest tests/libs
- if: runner.os == 'Windows'
- name: Run libs tests Windows
- shell: cmd
+ # - run: |
+ # poetry run pytest tests/libs
+ # if: runner.os != 'Windows'
+ # name: Run libs tests Linux/MAC
+ # - run: |
+ # poetry run pytest tests/libs
+ # if: runner.os == 'Windows'
+ # name: Run libs tests Windows
+ # shell: cmd

# - name: Install Pydantic 1.0
# run: pip install "pydantic<2"
4 changes: 3 additions & 1 deletion .github/workflows/test_destination_athena.yml
@@ -67,7 +67,9 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E athena --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
4 changes: 3 additions & 1 deletion .github/workflows/test_destination_athena_iceberg.yml
@@ -67,7 +67,9 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E athena --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
4 changes: 3 additions & 1 deletion .github/workflows/test_destination_bigquery.yml
@@ -66,7 +66,9 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_clickhouse.yml
@@ -58,10 +58,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-clickhouse

- name: Install dependencies
- run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_databricks.yml
@@ -61,10 +61,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-databricks

- name: Install dependencies
- run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_dremio.yml
@@ -62,10 +62,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-dremio

- name: Install dependencies
- run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- run: |
poetry run pytest tests/load --ignore tests/load/sources
2 changes: 1 addition & 1 deletion .github/workflows/test_destination_lancedb.yml
@@ -59,7 +59,7 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-lance

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
4 changes: 3 additions & 1 deletion .github/workflows/test_destination_motherduck.yml
@@ -64,7 +64,9 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck

- name: Install dependencies
- run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_mssql.yml
@@ -66,10 +66,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-mssql

- name: Install dependencies
- run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
2 changes: 1 addition & 1 deletion .github/workflows/test_destination_qdrant.yml
@@ -60,7 +60,7 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-qdrant

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_snowflake.yml
@@ -61,10 +61,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-snow

- name: Install dependencies
- run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
6 changes: 4 additions & 2 deletions .github/workflows/test_destination_synapse.yml
@@ -64,10 +64,12 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-mssql

- name: Install dependencies
- run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis
+ run: |
+ poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
4 changes: 3 additions & 1 deletion .github/workflows/test_destinations.yml
@@ -77,7 +77,9 @@ jobs:
# key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift

- name: Install dependencies
- run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg
+ run: |
+ poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake -E pyiceberg
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: Upgrade sqlalchemy
run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`
2 changes: 1 addition & 1 deletion .github/workflows/test_doc_snippets.yml
@@ -84,7 +84,7 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-docs

- name: run docs preprocessor
run: make preprocess-docs
4 changes: 3 additions & 1 deletion .github/workflows/test_local_destinations.yml
@@ -95,7 +95,9 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations

- name: Install dependencies
- run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg
+ run: |
+ poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake -E pyiceberg
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: Upgrade sqlalchemy
run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`
8 changes: 6 additions & 2 deletions .github/workflows/test_sqlalchemy_destinations.yml
@@ -83,10 +83,14 @@ jobs:
uses: actions/cache@v3
with:
path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations
+ key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-sql-alchemy

- name: Install dependencies
- run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}"
+ run: |
+ poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline
+ poetry run pip install mysqlclient
+ poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}"
+ poetry run pip install ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
3 changes: 3 additions & 0 deletions Makefile
@@ -46,6 +46,9 @@ has-poetry:
dev: has-poetry
poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk

+ dev-common: has-poetry
+ poetry install -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources

lint:
./tools/check-package.sh
poetry run python ./tools/check-lockfile.py
17 changes: 16 additions & 1 deletion dlt/common/libs/pyarrow.py
@@ -135,6 +135,21 @@ def get_pyarrow_int(precision: Optional[int]) -> Any:
return pyarrow.int64()


# def minimal_integer_type(max_value: int, signed: bool = True) -> pyarrow.DataType:
# num_bits = max_value.bit_length() or 1 # Ensure at least 1 bit
# if signed:
# num_bits += 1 # Add 1 bit for the sign

# # Standard bit widths for integer types
# bit_widths = [8, 16, 32, 64]

# # Find the minimal bit width that can accommodate num_bits
# bit_width = next((bw for bw in bit_widths if num_bits <= bw), 64)

# # Create the integer type using pa.int_()
# return pyarrow.int_(bit_width, signed=signed)


def get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType:
"""Returns (data_type, precision, scale) tuple from pyarrow.DataType"""
if pyarrow.types.is_string(dtype) or pyarrow.types.is_large_string(dtype):
@@ -594,7 +609,7 @@ def row_tuples_to_arrow(
logger.info(
"Pandas not installed, reverting to numpy.asarray to create a table which is slower"
)
- pivoted_rows = np.asarray(rows, dtype="object", order="k").T # type: ignore[call-overload]
+ pivoted_rows = np.asarray(rows, dtype="object", order="k").T # type: ignore[call-overload,unused-ignore]

columnar = {
col: dat.ravel() for col, dat in zip(columns, np.vsplit(pivoted_rows, len(pivoted_rows)))
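
The minimal_integer_type helper added above is only sketched in a comment. A runnable variant could look like the following (an illustration, not part of the PR; note that pyarrow exposes fixed-width constructors such as pyarrow.int8()/uint8() rather than a generic pyarrow.int_() factory, so the computed bit width is mapped to a concrete constructor):

import pyarrow

_SIGNED = {8: pyarrow.int8, 16: pyarrow.int16, 32: pyarrow.int32, 64: pyarrow.int64}
_UNSIGNED = {8: pyarrow.uint8, 16: pyarrow.uint16, 32: pyarrow.uint32, 64: pyarrow.uint64}

def minimal_integer_type(max_value: int, signed: bool = True) -> pyarrow.DataType:
    num_bits = max_value.bit_length() or 1  # ensure at least 1 bit
    if signed:
        num_bits += 1  # reserve one bit for the sign
    bit_width = next((bw for bw in (8, 16, 32, 64) if num_bits <= bw), 64)
    return (_SIGNED if signed else _UNSIGNED)[bit_width]()

# e.g. minimal_integer_type(300) -> int16(), minimal_integer_type(255, signed=False) -> uint8()
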
2 changes: 1 addition & 1 deletion dlt/common/typing.py
@@ -332,7 +332,7 @@ def is_typeddict(t: Type[Any]) -> bool:

def is_annotated(ann_type: Any) -> bool:
try:
- return issubclass(get_origin(ann_type), Annotated) # type: ignore[arg-type]
+ return get_origin(ann_type) is Annotated
except TypeError:
return False

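
For context on the is_annotated change: typing.get_origin() maps an Annotated alias back to the Annotated special form itself, so an identity check is sufficient, while the old issubclass test stops working once Annotated is no longer a plain class (which appears to be the case on Python 3.13 — the TypeError would be swallowed by the surrounding try/except and annotated types would silently go undetected). A small sketch, assuming standard typing.get_origin semantics on Python 3.9+:

from typing import Annotated, get_origin

ann = Annotated[int, "meta"]

assert get_origin(ann) is Annotated   # get_origin() maps the alias back to Annotated itself
assert get_origin(int) is None        # non-generic types have no origin

try:
    # old check: fine while Annotated is a regular class,
    # but raises TypeError when it is a typing special form
    print(issubclass(get_origin(ann), Annotated))
except TypeError as exc:
    print(f"issubclass rejected Annotated: {exc}")
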
6 changes: 3 additions & 3 deletions dlt/destinations/impl/databricks/databricks.py
@@ -29,10 +29,10 @@
from dlt.common.schema.typing import TColumnType
from dlt.common.storages import FilesystemConfiguration, fsspec_from_config

- from dlt.destinations.insert_job_client import InsertValuesJobClient
from dlt.destinations.exceptions import LoadJobTerminalException
from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration
from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient
+ from dlt.destinations.job_client_impl import SqlJobClientWithStagingDataset
from dlt.destinations.sql_jobs import SqlMergeFollowupJob
from dlt.destinations.job_impl import ReferenceFollowupJobRequest
from dlt.destinations.utils import is_compression_disabled
@@ -198,7 +198,7 @@ def gen_delete_from_sql(
"""


- class DatabricksClient(InsertValuesJobClient, SupportsStagingDestination):
+ class DatabricksClient(SqlJobClientWithStagingDataset, SupportsStagingDestination):
def __init__(
self,
schema: Schema,
@@ -213,7 +213,7 @@ def __init__(
)
super().__init__(schema, config, sql_client)
self.config: DatabricksClientConfiguration = config
- self.sql_client: DatabricksSqlClient = sql_client
+ self.sql_client: DatabricksSqlClient = sql_client # type: ignore[assignment, unused-ignore]
self.type_mapper = self.capabilities.get_type_mapper()

def create_load_job(
10 changes: 5 additions & 5 deletions dlt/destinations/impl/databricks/sql_client.py
@@ -16,7 +16,7 @@
)

from databricks.sdk.core import Config, oauth_service_principal
- from databricks import sql as databricks_lib # type: ignore[attr-defined]
+ from databricks import sql as databricks_lib # type: ignore[attr-defined, unused-ignore]
from databricks.sql.client import (
Connection as DatabricksSqlConnection,
Cursor as DatabricksSqlCursor,
@@ -43,7 +43,7 @@
class DatabricksCursorImpl(DBApiCursorImpl):
"""Use native data frame support if available"""

- native_cursor: DatabricksSqlCursor
+ native_cursor: DatabricksSqlCursor # type: ignore[assignment, unused-ignore]
vector_size: ClassVar[int] = 2048 # vector size is 2048

def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]:
@@ -140,7 +140,7 @@ def execute_sql(
@contextmanager
@raise_database_error
def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]:
- curr: DBApiCursor
+ # curr: DBApiCursor
# TODO: Inline param support will be dropped in future databricks driver, switch to :named paramstyle
# This will drop support for cluster runtime v13.x
# db_args: Optional[Dict[str, Any]]
@@ -158,11 +158,11 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
# db_args[key] = db_arg
# else:
# db_args = kwargs or None

assert isinstance(query, str)
db_args = args or kwargs or None
with self._conn.cursor() as curr:
curr.execute(query, db_args)
- yield DatabricksCursorImpl(curr) # type: ignore[abstract]
+ yield DatabricksCursorImpl(curr) # type: ignore[arg-type, abstract, unused-ignore]

def catalog_name(self, escape: bool = True) -> Optional[str]:
catalog = self.capabilities.casefold_identifier(self.credentials.catalog)