diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml
index 674b38a776..359ed43095 100644
--- a/.github/workflows/test_common.yml
+++ b/.github/workflows/test_common.yml
@@ -18,6 +18,8 @@ env:
# we need the secrets only for the rest_api_pipeline tests which are in tests/sources
# so we inject them only at the end
SOURCES__GITHUB__ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ # and also for the github_api_pipeline tests
+ SOURCES__GITHUB_API_PIPELINE__ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
get_docs_changes:
@@ -114,7 +116,7 @@ jobs:
shell: cmd
- name: Install pyarrow
- run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk
+ run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2
- run: |
poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow
@@ -127,7 +129,7 @@ jobs:
shell: cmd
- name: Install pipeline and sources dependencies
- run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources
+ run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2
- run: |
poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common tests/destinations tests/sources
@@ -153,6 +155,20 @@ jobs:
name: Run extract tests Windows
shell: cmd
+ # here we upgrade pyarrow to 17 and run the libs tests again
+ - name: Install pyarrow 17
+ run: poetry run pip install pyarrow==17.0.0
+
+ - run: |
+ poetry run pytest tests/libs
+ if: runner.os != 'Windows'
+ name: Run libs tests Linux/MAC
+ - run: |
+ poetry run pytest tests/libs
+ if: runner.os == 'Windows'
+ name: Run libs tests Windows
+ shell: cmd
+
# - name: Install Pydantic 1.0
# run: pip install "pydantic<2"
diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml
index 46096d36a8..df398e13ad 100644
--- a/.github/workflows/test_destinations.yml
+++ b/.github/workflows/test_destinations.yml
@@ -22,6 +22,7 @@ env:
TESTS__R2_AWS_ACCESS_KEY_ID: a4950a5003b26f5a71ac97ef3848ff4c
TESTS__R2_AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }}
TESTS__R2_ENDPOINT_URL: https://9830548e4e4b582989be0811f2a0a97f.r2.cloudflarestorage.com
+ TESTS__R2_REGION_NAME: us-east-1
# RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752
RUNTIME__LOG_LEVEL: ERROR
@@ -67,13 +68,13 @@ jobs:
virtualenvs-in-project: true
installer-parallel: true
- - name: Load cached venv
- id: cached-poetry-dependencies
- uses: actions/cache@v3
- with:
- # path: ${{ steps.pip-cache.outputs.dir }}
- path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift
+ # - name: Load cached venv
+ # id: cached-poetry-dependencies
+ # uses: actions/cache@v3
+ # with:
+ # # path: ${{ steps.pip-cache.outputs.dir }}
+ # path: .venv
+ # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift
- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
@@ -82,9 +83,6 @@ jobs:
- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
- - name: clear duckdb secrets and cache
- run: rm -rf ~/.duckdb
-
- run: |
poetry run pytest tests/load --ignore tests/load/sources -m "essential"
name: Run essential tests Linux
diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml
index e6d58376ba..ae06a72df9 100644
--- a/.github/workflows/test_doc_snippets.yml
+++ b/.github/workflows/test_doc_snippets.yml
@@ -91,7 +91,7 @@ jobs:
- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow
+ run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow -E s3
- name: create secrets.toml for examples
run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml
diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml
index a4548f6529..61bfe1551a 100644
--- a/.github/workflows/test_local_destinations.yml
+++ b/.github/workflows/test_local_destinations.yml
@@ -119,7 +119,6 @@ jobs:
DESTINATION__FILESYSTEM__CREDENTIALS__SFTP_USERNAME: foo
DESTINATION__FILESYSTEM__CREDENTIALS__SFTP_PASSWORD: pass
-
- name: Stop weaviate
if: always()
run: docker compose -f "tests/load/weaviate/docker-compose.yml" down -v
diff --git a/.github/workflows/test_pyarrow17.yml b/.github/workflows/test_pyarrow17.yml
deleted file mode 100644
index c18e020352..0000000000
--- a/.github/workflows/test_pyarrow17.yml
+++ /dev/null
@@ -1,83 +0,0 @@
-
-name: tests marked as needspyarrow17
-
-on:
- pull_request:
- branches:
- - master
- - devel
- workflow_dispatch:
- schedule:
- - cron: '0 2 * * *'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
- cancel-in-progress: true
-
-env:
-
- DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }}
-
- # RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752
- RUNTIME__LOG_LEVEL: ERROR
- RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }}
-
- ACTIVE_DESTINATIONS: "[\"filesystem\"]"
- ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\", \"r2\", \"s3\", \"gs\", \"az\", \"abfss\", \"gdrive\"]" #excludes sftp
-
-jobs:
- get_docs_changes:
- name: docs changes
- uses: ./.github/workflows/get_docs_changes.yml
- if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}}
-
- run_pyarrow17:
- name: needspyarrow17 tests
- needs: get_docs_changes
- if: needs.get_docs_changes.outputs.changes_outside_docs == 'true'
- defaults:
- run:
- shell: bash
- runs-on: "ubuntu-latest"
-
- steps:
-
- - name: Check out
- uses: actions/checkout@master
-
- - name: Setup Python
- uses: actions/setup-python@v4
- with:
- python-version: "3.10.x"
-
- - name: Install Poetry
- uses: snok/install-poetry@v1.3.2
- with:
- virtualenvs-create: true
- virtualenvs-in-project: true
- installer-parallel: true
-
- - name: Load cached venv
- id: cached-poetry-dependencies
- uses: actions/cache@v3
- with:
- path: .venv
- key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-pyarrow17
-
- - name: Install dependencies
- run: poetry install --no-interaction --with sentry-sdk --with pipeline -E deltalake -E duckdb -E filesystem -E gs -E s3 -E az
-
-
- - name: Upgrade pyarrow
- run: poetry run pip install pyarrow==17.0.0
-
- - name: create secrets.toml
- run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
-
- - name: clear duckdb secrets and cache
- run: rm -rf ~/.duckdb
-
- - name: Run needspyarrow17 tests Linux
- run: |
- poetry run pytest tests/libs -m "needspyarrow17"
- poetry run pytest tests/load -m "needspyarrow17"
diff --git a/Makefile b/Makefile
index 5d86d7febe..2a7f6dac0a 100644
--- a/Makefile
+++ b/Makefile
@@ -60,10 +60,13 @@ format:
poetry run black dlt docs tests --exclude=".*syntax_error.py|\.venv.*|_storage/.*"
# poetry run isort ./
-lint-and-test-snippets:
+lint-snippets:
cd docs/tools && poetry run python check_embedded_snippets.py full
- poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools --exclude docs/tools/lint_setup --exclude docs/website/docs_processed
- poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools
+
+
+lint-and-test-snippets: lint-snippets
+ poetry run mypy --config-file mypy.ini docs/website docs/tools --exclude docs/tools/lint_setup --exclude docs/website/docs_processed
+ poetry run flake8 --max-line-length=200 docs/website docs/tools --exclude docs/website/.dlt-repo
cd docs/website/docs && poetry run pytest --ignore=node_modules
lint-and-test-examples:
@@ -72,7 +75,6 @@ lint-and-test-examples:
poetry run mypy --config-file mypy.ini docs/examples
cd docs/examples && poetry run pytest
-
test-examples:
cd docs/examples && poetry run pytest
diff --git a/dlt/cli/__init__.py b/dlt/cli/__init__.py
index 2c129d95b7..d4016b611f 100644
--- a/dlt/cli/__init__.py
+++ b/dlt/cli/__init__.py
@@ -1 +1,4 @@
from dlt.cli.reference import SupportsCliCommand
+from dlt.cli.exceptions import CliCommandException
+
+__all__ = ["SupportsCliCommand", "CliCommandException"]
diff --git a/dlt/cli/config_toml_writer.py b/dlt/cli/config_toml_writer.py
index 59b16b16e1..5a981968b1 100644
--- a/dlt/cli/config_toml_writer.py
+++ b/dlt/cli/config_toml_writer.py
@@ -1,4 +1,4 @@
-from typing import Any, NamedTuple, Tuple, Iterable
+from typing import Any, NamedTuple, Tuple, Iterable, Mapping
import tomlkit
from tomlkit.items import Table as TOMLTable
from tomlkit.container import Container as TOMLContainer
@@ -72,7 +72,7 @@ def write_value(
hint = extract_inner_hint(hint)
if is_base_configuration_inner_hint(hint):
inner_table = tomlkit.table(is_super_table=True)
- write_spec(inner_table, hint(), overwrite_existing)
+ write_spec(inner_table, hint(), default_value, overwrite_existing)
if len(inner_table) > 0:
toml_table[name] = inner_table
else:
@@ -86,17 +86,31 @@ def write_value(
toml_table[name] = default_value
-def write_spec(toml_table: TOMLTable, config: BaseConfiguration, overwrite_existing: bool) -> None:
+def write_spec(
+ toml_table: TOMLTable,
+ config: BaseConfiguration,
+ initial_value: Mapping[str, Any],
+ overwrite_existing: bool,
+) -> None:
for name, hint in config.get_resolvable_fields().items():
+ # use initial value
+ initial_ = initial_value.get(name) if initial_value else None
+ # use default value stored in config
default_value = getattr(config, name, None)
+
# check if field is of particular interest and should be included if it has default
is_default_of_interest = name in config.__config_gen_annotations__
+
+ # if initial is different from default, it is of interest as well
+ if initial_ is not None:
+ is_default_of_interest = is_default_of_interest or (initial_ != default_value)
+
write_value(
toml_table,
name,
hint,
overwrite_existing,
- default_value=default_value,
+ default_value=initial_ or default_value,
is_default_of_interest=is_default_of_interest,
)
diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py
index 16b51d64f1..ac8adcc588 100644
--- a/dlt/cli/init_command.py
+++ b/dlt/cli/init_command.py
@@ -32,6 +32,8 @@
from dlt.cli.config_toml_writer import WritableConfigValue, write_values
from dlt.cli.pipeline_files import (
TEMPLATE_FILES,
+ SOURCES_MODULE_NAME,
+ SINGLE_FILE_TEMPLATE_MODULE_NAME,
SourceConfiguration,
TVerifiedSourceFileEntry,
TVerifiedSourceFileIndex,
@@ -41,8 +43,6 @@
DLT_INIT_DOCS_URL = "https://dlthub.com/docs/reference/command-line-interface#dlt-init"
DEFAULT_VERIFIED_SOURCES_REPO = "https://github.com/dlt-hub/verified-sources.git"
-TEMPLATES_MODULE_NAME = "pipeline_templates"
-SOURCES_MODULE_NAME = "sources"
def _get_core_sources_storage() -> FileStorage:
@@ -57,7 +57,7 @@ def _get_templates_storage() -> FileStorage:
init_path = (
Path(os.path.dirname(os.path.realpath(__file__))).parent
/ SOURCES_MODULE_NAME
- / TEMPLATES_MODULE_NAME
+ / SINGLE_FILE_TEMPLATE_MODULE_NAME
)
return FileStorage(str(init_path))
@@ -382,6 +382,16 @@ def init_command(
source_configuration = files_ops.get_core_source_configuration(
core_sources_storage, source_name
)
+ from importlib.metadata import Distribution
+
+ dist = Distribution.from_name(DLT_PKG_NAME)
+ extras = dist.metadata.get_all("Provides-Extra") or []
+
+ # Match the extra name to the source name
+ canonical_source_name = source_name.replace("_", "-").lower()
+
+ if canonical_source_name in extras:
+ source_configuration.requirements.update_dlt_extras(canonical_source_name)
else:
if not is_valid_schema_name(source_name):
raise InvalidSchemaName(source_name)
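As a side note, a minimal sketch of the extras lookup used above, runnable outside of dlt init; the package name "dlt" (what DLT_PKG_NAME resolves to) and the "sql_database" source are illustrative. Extra names are exposed with dashes, so the underscore in the source name must be replaced before matching Provides-Extra.

from importlib.metadata import Distribution

dist = Distribution.from_name("dlt")  # assuming DLT_PKG_NAME resolves to "dlt"
extras = dist.metadata.get_all("Provides-Extra") or []

source_name = "sql_database"  # hypothetical core source picked for illustration
canonical_source_name = source_name.replace("_", "-").lower()
print(canonical_source_name in extras)  # True only if this dlt build declares the extra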
diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py
index d879281808..55f9e828aa 100644
--- a/dlt/cli/pipeline_command.py
+++ b/dlt/cli/pipeline_command.py
@@ -131,9 +131,8 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
streamlit_cmd.append("--")
streamlit_cmd.append(pipeline_name)
- if pipelines_dir:
- streamlit_cmd.append("--pipelines-dir")
- streamlit_cmd.append(pipelines_dir)
+ streamlit_cmd.append("--pipelines-dir")
+ streamlit_cmd.append(p.pipelines_dir)
venv = Venv.restore_current()
for line in iter_stdout(venv, *streamlit_cmd):
diff --git a/dlt/cli/pipeline_files.py b/dlt/cli/pipeline_files.py
index c15f988e54..b6f8f85271 100644
--- a/dlt/cli/pipeline_files.py
+++ b/dlt/cli/pipeline_files.py
@@ -18,6 +18,11 @@
TSourceType = Literal["core", "verified", "template"]
SOURCES_INIT_INFO_ENGINE_VERSION = 1
+
+SOURCES_MODULE_NAME = "sources"
+CORE_SOURCE_TEMPLATE_MODULE_NAME = "_core_source_templates"
+SINGLE_FILE_TEMPLATE_MODULE_NAME = "_single_file_templates"
+
SOURCES_INIT_INFO_FILE = ".sources"
IGNORE_FILES = ["*.py[cod]", "*$py.class", "__pycache__", "py.typed", "requirements.txt"]
IGNORE_VERIFIED_SOURCES = [".*", "_*"]
@@ -25,10 +30,10 @@
".*",
"_*",
"helpers",
- "pipeline_templates",
+ SINGLE_FILE_TEMPLATE_MODULE_NAME,
+ CORE_SOURCE_TEMPLATE_MODULE_NAME,
]
PIPELINE_FILE_SUFFIX = "_pipeline.py"
-
# hardcode default template files here
TEMPLATE_FILES = [".gitignore", ".dlt/config.toml"]
DEFAULT_PIPELINE_TEMPLATE = "default_pipeline.py"
@@ -224,15 +229,16 @@ def get_template_configuration(
def get_core_source_configuration(
sources_storage: FileStorage, source_name: str
) -> SourceConfiguration:
- pipeline_file = source_name + "_pipeline.py"
+ src_pipeline_file = CORE_SOURCE_TEMPLATE_MODULE_NAME + "/" + source_name + PIPELINE_FILE_SUFFIX
+ dest_pipeline_file = source_name + PIPELINE_FILE_SUFFIX
return SourceConfiguration(
"core",
"dlt.sources." + source_name,
sources_storage,
- pipeline_file,
- pipeline_file,
- [],
+ src_pipeline_file,
+ dest_pipeline_file,
+ [".gitignore"],
SourceRequirements([]),
_get_docstring_for_module(sources_storage, source_name),
False,
@@ -247,7 +253,7 @@ def get_verified_source_configuration(
f"Verified source {source_name} could not be found in the repository", source_name
)
# find example script
- example_script = f"{source_name}_pipeline.py"
+ example_script = f"{source_name}{PIPELINE_FILE_SUFFIX}"
if not sources_storage.has_file(example_script):
raise VerifiedSourceRepoError(
f"Pipeline example script {example_script} could not be found in the repository",
diff --git a/dlt/common/configuration/accessors.py b/dlt/common/configuration/accessors.py
index a93d8e0b76..b605fb11cb 100644
--- a/dlt/common/configuration/accessors.py
+++ b/dlt/common/configuration/accessors.py
@@ -27,6 +27,13 @@ def __setitem__(self, field: str, value: Any) -> None:
key = sections.pop()
self.writable_provider.set_value(key, value, None, *sections)
+ def __contains__(self, field: str) -> bool:
+ try:
+ self[field]
+ return True
+ except KeyError:
+ return False
+
def get(self, field: str, expected_type: Type[TConfigAny] = None) -> TConfigAny:
value: TConfigAny
value, _ = self._get_value(field, expected_type)
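A small usage sketch for the new `__contains__`: membership checks on the `dlt.config` / `dlt.secrets` accessors no longer require catching `KeyError`. The key used below is illustrative.

import dlt

if "runtime.log_level" in dlt.config:  # resolved through the same lookup as dlt.config[...]
    print(dlt.config["runtime.log_level"])
else:
    print("log level not configured")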
diff --git a/dlt/common/configuration/container.py b/dlt/common/configuration/container.py
index 05680460e3..74cb23dea9 100644
--- a/dlt/common/configuration/container.py
+++ b/dlt/common/configuration/container.py
@@ -92,7 +92,7 @@ def _thread_context(
return self.main_context
else:
# thread pool names used in dlt contain originating thread id. use this id over pool id
- if m := re.match(r"dlt-pool-(\d+)-", threading.currentThread().getName()):
+ if m := re.match(r"dlt-pool-(\d+)-", threading.current_thread().name):
thread_id = int(m.group(1))
else:
thread_id = threading.get_ident()
diff --git a/dlt/common/configuration/plugins.py b/dlt/common/configuration/plugins.py
index ac9cdd56a8..b1f13af5b5 100644
--- a/dlt/common/configuration/plugins.py
+++ b/dlt/common/configuration/plugins.py
@@ -48,7 +48,7 @@ def load_setuptools_entrypoints(m: pluggy.PluginManager) -> None:
for dist in list(importlib.metadata.distributions()):
# skip named dists that do not start with dlt-
- if hasattr(dist, "name") and not dist.name.startswith("dlt-"):
+ if hasattr(dist, "name") and (dist.name is None or not dist.name.startswith("dlt-")):
continue
for ep in dist.entry_points:
if (
diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py
index a680be4f3a..3636565fae 100644
--- a/dlt/common/configuration/providers/toml.py
+++ b/dlt/common/configuration/providers/toml.py
@@ -1,5 +1,6 @@
import os
import tomlkit
+import tomlkit.exceptions
import tomlkit.items
from typing import Any, Optional
@@ -53,6 +54,10 @@ def __init__(
it will additionally look for `file_name` in `dlt` global dir (home dir by default) and merge the content.
The "settings" (`settings_dir`) values overwrite the "global" values.
+        If the toml file under `settings_dir` is not found, it will look into the Google Colab userdata
+        object for a value named `file_name` and load the toml file from it.
+        If that one is not found, it will try to load the Streamlit `secrets.toml` file.
+
If none of the files exist, an empty provider is created.
Args:
@@ -65,6 +70,10 @@ def __init__(
Raises:
TomlProviderReadException: File could not be read, most probably `toml` parsing error
"""
+ # set supports_secrets early, we need this flag to read config
+ self._supports_secrets = supports_secrets
+ # read toml file from local or from various environments
+
self._toml_path = os.path.join(settings_dir, file_name)
self._global_dir = os.path.join(global_dir, file_name) if global_dir else None
self._config_toml = self._read_toml_files(
@@ -111,27 +120,78 @@ def set_fragment(
def to_toml(self) -> str:
return tomlkit.dumps(self._config_toml)
- @staticmethod
- def _read_toml_files(
- name: str, file_name: str, toml_path: str, global_path: str
- ) -> tomlkit.TOMLDocument:
+ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLDocument:
+ """Try to load the toml from google colab userdata object"""
try:
- project_toml = SettingsTomlProvider._read_toml(toml_path)
- if global_path:
- global_toml = SettingsTomlProvider._read_toml(global_path)
- project_toml = update_dict_nested(global_toml, project_toml)
- return project_toml
- except Exception as ex:
- raise TomlProviderReadException(name, file_name, toml_path, str(ex))
+ from google.colab import userdata
+
+ try:
+ return tomlkit.loads(userdata.get(file_name))
+ except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
+ # document not found if secret does not exist or we have no permission
+ return None
+ except ImportError:
+ # document not found if google colab context does not exist
+ return None
+
+ def _read_streamlit_secrets(self, name: str, file_name: str) -> tomlkit.TOMLDocument:
+ """Try to load the toml from Streamlit secrets."""
+ # only secrets can come from streamlit
+ if not self.supports_secrets:
+ return None
- @staticmethod
- def _read_toml(toml_path: str) -> tomlkit.TOMLDocument:
+ try:
+ import streamlit as st
+ import streamlit.runtime as st_r # type: ignore
+
+ if not st_r.exists():
+ return None
+
+ # Access the entire secrets store
+ secrets_ = st.secrets
+ if secrets_.load_if_toml_exists():
+ # Convert the dictionary to a TOML string
+ toml_str = tomlkit.dumps(secrets_.to_dict())
+
+ # Parse the TOML string into a TOMLDocument
+ toml_doc = tomlkit.parse(toml_str)
+ return toml_doc
+ else:
+ return None
+ except tomlkit.exceptions.TOMLKitError:
+ raise
+ except Exception:
+ # Not in a Streamlit context
+ return None
+
+ def _read_toml_file(self, toml_path: str) -> tomlkit.TOMLDocument:
if os.path.isfile(toml_path):
with open(toml_path, "r", encoding="utf-8") as f:
# use whitespace preserving parser
return tomlkit.load(f)
else:
- return tomlkit.document()
+ return None
+
+ def _read_toml_files(
+ self, name: str, file_name: str, toml_path: str, global_path: str
+ ) -> tomlkit.TOMLDocument:
+ try:
+ if (project_toml := self._read_toml_file(toml_path)) is not None:
+ pass
+ elif (project_toml := self._read_google_colab_secrets(name, file_name)) is not None:
+ pass
+ elif (project_toml := self._read_streamlit_secrets(name, file_name)) is not None:
+ pass
+ else:
+ # empty doc
+ project_toml = tomlkit.document()
+ if global_path:
+ global_toml = self._read_toml_file(global_path)
+ if global_toml is not None:
+ project_toml = update_dict_nested(global_toml, project_toml)
+ return project_toml
+ except Exception as ex:
+ raise TomlProviderReadException(name, file_name, toml_path, str(ex))
class ConfigTomlProvider(SettingsTomlProvider):
diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py
index e13701def5..97bcfd315e 100644
--- a/dlt/common/configuration/resolve.py
+++ b/dlt/common/configuration/resolve.py
@@ -283,7 +283,7 @@ def _resolve_config_fields(
unresolved_fields[key] = traces
# set resolved value in config
if default_value != current_value:
- if not is_hint_not_resolvable(hint):
+ if not is_hint_not_resolvable(hint) or explicit_value is not None or explicit_none:
# ignore final types
setattr(config, key, current_value)
diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py
index dd40d3b775..5f69be6a33 100644
--- a/dlt/common/configuration/specs/aws_credentials.py
+++ b/dlt/common/configuration/specs/aws_credentials.py
@@ -51,6 +51,8 @@ def to_session_credentials(self) -> Dict[str, str]:
def to_object_store_rs_credentials(self) -> Dict[str, str]:
# https://docs.rs/object_store/latest/object_store/aws
+ # NOTE: delta rs will set the values below in env variables of the current process
+ # https://github.com/delta-io/delta-rs/blob/bdf1c4e765ca457e49d4fa53335d42736220f57f/rust/src/storage/s3.rs#L257
creds = cast(
Dict[str, str],
without_none(
@@ -64,8 +66,8 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:
),
)
- if "endpoint_url" not in creds: # AWS S3
- if "region" not in creds:
+ if not self.endpoint_url: # AWS S3
+ if not self.region_name:
raise ObjectStoreRsCredentialsException(
"`object_store` Rust crate requires AWS region when using AWS S3."
)
diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py
index 371a988109..c2071e2188 100644
--- a/dlt/common/configuration/specs/azure_credentials.py
+++ b/dlt/common/configuration/specs/azure_credentials.py
@@ -1,17 +1,21 @@
from typing import Optional, Dict, Any, Union
from dlt.common.pendulum import pendulum
+from dlt.common.exceptions import MissingDependencyException
from dlt.common.typing import TSecretStrValue
from dlt.common.configuration.specs import (
CredentialsConfiguration,
CredentialsWithDefault,
configspec,
)
+from dlt import version
+
+_AZURE_STORAGE_EXTRA = f"{version.DLT_PKG_NAME}[az]"
@configspec
class AzureCredentialsWithoutDefaults(CredentialsConfiguration):
- """Credentials for azure blob storage, compatible with adlfs"""
+ """Credentials for Azure Blob Storage, compatible with adlfs"""
azure_storage_account_name: str = None
azure_storage_account_key: Optional[TSecretStrValue] = None
@@ -37,7 +41,10 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:
return creds
def create_sas_token(self) -> None:
- from azure.storage.blob import generate_account_sas, ResourceTypes
+ try:
+ from azure.storage.blob import generate_account_sas, ResourceTypes
+ except ModuleNotFoundError:
+ raise MissingDependencyException(self.__class__.__name__, [_AZURE_STORAGE_EXTRA])
self.azure_storage_sas_token = generate_account_sas(
account_name=self.azure_storage_account_name,
@@ -78,7 +85,10 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]:
@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
def on_partial(self) -> None:
- from azure.identity import DefaultAzureCredential
+ try:
+ from azure.identity import DefaultAzureCredential
+ except ModuleNotFoundError:
+ raise MissingDependencyException(self.__class__.__name__, [_AZURE_STORAGE_EXTRA])
if not self.azure_storage_account_key and not self.azure_storage_sas_token:
self._set_default_credentials(DefaultAzureCredential())
@@ -99,7 +109,10 @@ class AzureServicePrincipalCredentials(
AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault
):
def on_partial(self) -> None:
- from azure.identity import DefaultAzureCredential
+ try:
+ from azure.identity import DefaultAzureCredential
+ except ModuleNotFoundError:
+ raise MissingDependencyException(self.__class__.__name__, [_AZURE_STORAGE_EXTRA])
self._set_default_credentials(DefaultAzureCredential())
if self.azure_storage_account_name:
diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py
index 7d852dd67e..60ab1d4b56 100644
--- a/dlt/common/configuration/specs/gcp_credentials.py
+++ b/dlt/common/configuration/specs/gcp_credentials.py
@@ -52,7 +52,7 @@ def __str__(self) -> str:
def to_gcs_credentials(self) -> Dict[str, Any]:
"""
- Dict of keyword arguments can be passed to gcsfs.
+ Dict of keyword arguments that can be passed to gcsfs.
Delegates default GCS credential handling to gcsfs.
"""
return {
@@ -64,6 +64,15 @@ def to_gcs_credentials(self) -> Dict[str, Any]:
),
}
+ def to_object_store_rs_credentials(self) -> Dict[str, str]:
+ """
+ Dict of keyword arguments that can be passed to `object_store` Rust crate.
+ Delegates default GCS credential handling to `object_store` Rust crate.
+ """
+ if isinstance(self, CredentialsWithDefault) and self.has_default_credentials():
+ return {}
+ return {"service_account_key": json.dumps(dict(self))}
+
@configspec
class GcpServiceAccountCredentialsWithoutDefaults(GcpCredentials):
@@ -117,10 +126,6 @@ def to_native_credentials(self) -> Any:
else:
return ServiceAccountCredentials.from_service_account_info(self)
- def to_object_store_rs_credentials(self) -> Dict[str, str]:
- # https://docs.rs/object_store/latest/object_store/gcp
- return {"service_account_key": json.dumps(dict(self))}
-
def __str__(self) -> str:
return f"{self.client_email}@{self.project_id}"
diff --git a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py
index 7b1ed72d2c..31bc672f7a 100644
--- a/dlt/common/configuration/utils.py
+++ b/dlt/common/configuration/utils.py
@@ -228,4 +228,6 @@ def add_config_dict_to_env(
destructure_dicts=destructure_dicts,
)
else:
- os.environ[env_key] = serialize_value(v)
+ # skip non-serializable fields
+ with contextlib.suppress(TypeError):
+ os.environ[env_key] = serialize_value(v)
diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py
index 06c8d7a95a..393e9e8508 100644
--- a/dlt/common/data_writers/escape.py
+++ b/dlt/common/data_writers/escape.py
@@ -79,6 +79,23 @@ def escape_duckdb_literal(v: Any) -> Any:
return str(v)
+def escape_lancedb_literal(v: Any) -> Any:
+ if isinstance(v, str):
+        # we escape extended strings, which behave like redshift strings
+ return _escape_extended(v, prefix="'")
+ if isinstance(v, (datetime, date, time)):
+ return f"'{v.isoformat()}'"
+ if isinstance(v, (list, dict)):
+ return _escape_extended(json.dumps(v), prefix="'")
+ # TODO: check how binaries are represented in fusion
+ if isinstance(v, bytes):
+ return f"from_base64('{base64.b64encode(v).decode('ascii')}')"
+ if v is None:
+ return "NULL"
+
+ return str(v)
+
+
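A quick sketch of what `escape_lancedb_literal` produces for a few value types; the expected shapes in the comments follow directly from the branches above.

from datetime import date

from dlt.common.data_writers.escape import escape_lancedb_literal

print(escape_lancedb_literal("O'Reilly"))         # extended string with the quote escaped
print(escape_lancedb_literal(date(2024, 10, 1)))  # '2024-10-01'
print(escape_lancedb_literal({"a": 1}))           # json payload rendered as an escaped string
print(escape_lancedb_literal(None))               # NULL
print(escape_lancedb_literal(b"\x01\x02"))        # from_base64('...') wrapper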
MS_SQL_ESCAPE_DICT = {
"'": "''",
"\n": "' + CHAR(10) + N'",
diff --git a/dlt/common/destination/__init__.py b/dlt/common/destination/__init__.py
index 2f50b3e3d2..13612a2976 100644
--- a/dlt/common/destination/__init__.py
+++ b/dlt/common/destination/__init__.py
@@ -4,7 +4,7 @@
TLoaderFileFormat,
LOADER_FILE_FORMATS,
)
-from dlt.common.destination.reference import TDestinationReferenceArg, Destination, TDestination
+from dlt.common.destination.reference import TDestinationReferenceArg, Destination, AnyDestination
from dlt.common.destination.typing import PreparedTableSchema
__all__ = [
@@ -15,5 +15,5 @@
"PreparedTableSchema",
"TDestinationReferenceArg",
"Destination",
- "TDestination",
+ "AnyDestination",
]
diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py
index 8b3819e32b..6c198dd468 100644
--- a/dlt/common/destination/reference.py
+++ b/dlt/common/destination/reference.py
@@ -1,7 +1,6 @@
from abc import ABC, abstractmethod
import dataclasses
from importlib import import_module
-from contextlib import contextmanager
from types import TracebackType
from typing import (
@@ -25,6 +24,7 @@
Protocol,
Tuple,
AnyStr,
+ overload,
)
from typing_extensions import Annotated
import datetime # noqa: 251
@@ -61,6 +61,7 @@
from dlt.common.storages.load_storage import ParsedLoadJobFileName
from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc
from dlt.common.exceptions import MissingDependencyException
+from dlt.common.typing import is_optional_type
TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration")
@@ -260,12 +261,10 @@ def normalize_dataset_name(self, schema: Schema) -> str:
def normalize_staging_dataset_name(self, schema: Schema) -> str:
"""Builds staging dataset name out of dataset_name and staging_dataset_name_layout."""
if "%s" in self.staging_dataset_name_layout:
- # if dataset name is empty, staging dataset name is also empty
+            # staging dataset name is never empty, otherwise table names would clash
dataset_name = self._make_dataset_name(schema.name)
- if not dataset_name:
- return dataset_name
# fill the placeholder
- dataset_name = self.staging_dataset_name_layout % dataset_name
+ dataset_name = self.staging_dataset_name_layout % (dataset_name or "")
else:
# no placeholder, then layout is a full name. so you can have a single staging dataset
dataset_name = self.staging_dataset_name_layout
@@ -276,6 +275,15 @@ def normalize_staging_dataset_name(self, schema: Schema) -> str:
else dataset_name
)
+ @classmethod
+ def needs_dataset_name(cls) -> bool:
+        """Checks if the configuration requires a dataset name to be present. Empty dataset names are
+        allowed, e.g. for schema-less destinations like weaviate or clickhouse
+ """
+ fields = cls.get_resolvable_fields()
+ dataset_name_type = fields["dataset_name"]
+ return not is_optional_type(dataset_name_type)
+
def _make_dataset_name(self, schema_name: str) -> str:
if not schema_name:
raise ValueError("schema_name is None or empty")
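A hedged sketch of what `needs_dataset_name` inspects: the type annotation of `dataset_name` on the concrete configuration class. The subclass below is hypothetical and exists only to show that an Optional annotation flips the result to False.

from typing import Optional

from dlt.common.configuration.specs import configspec
from dlt.common.destination.reference import DestinationClientDwhConfiguration

@configspec
class HypotheticalSchemalessConfiguration(DestinationClientDwhConfiguration):
    # Optional annotation -> dataset name is not required for this destination
    dataset_name: Optional[str] = None

print(HypotheticalSchemalessConfiguration.needs_dataset_name())  # False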
@@ -407,6 +415,8 @@ def run_managed(
"""
wrapper around the user implemented run method
"""
+ from dlt.common.runtime import signals
+
# only jobs that are not running or have not reached a final state
# may be started
assert self._state in ("ready", "retry")
@@ -433,6 +443,8 @@ def run_managed(
self._finished_at = pendulum.now()
# sanity check
assert self._state in ("completed", "retry", "failed")
+ # wake up waiting threads
+ signals.wake_all()
@abstractmethod
def run(self) -> None:
@@ -471,7 +483,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRe
class SupportsReadableRelation(Protocol):
"""A readable relation retrieved from a destination that supports it"""
- schema_columns: TTableSchemaColumns
+ columns_schema: TTableSchemaColumns
"""Known dlt table columns for this relation"""
def df(self, chunk_size: int = None) -> Optional[DataFrame]:
@@ -490,19 +502,61 @@ def df(self, chunk_size: int = None) -> Optional[DataFrame]:
"""
...
- def arrow(self, chunk_size: int = None) -> Optional[ArrowTable]: ...
+ # accessing data
+ def arrow(self, chunk_size: int = None) -> Optional[ArrowTable]:
+ """fetch arrow table of first 'chunk_size' items"""
+ ...
+
+ def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]:
+        """iterate over data frames of 'chunk_size' items"""
+ ...
- def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]: ...
+ def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]:
+ """iterate over arrow tables of 'chunk_size' items"""
+ ...
- def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]: ...
+ def fetchall(self) -> List[Tuple[Any, ...]]:
+ """fetch all items as list of python tuples"""
+ ...
- def fetchall(self) -> List[Tuple[Any, ...]]: ...
+ def fetchmany(self, chunk_size: int) -> List[Tuple[Any, ...]]:
+ """fetch first 'chunk_size' items as list of python tuples"""
+ ...
- def fetchmany(self, chunk_size: int) -> List[Tuple[Any, ...]]: ...
+ def iter_fetch(self, chunk_size: int) -> Generator[List[Tuple[Any, ...]], Any, Any]:
+        """iterate over lists of python tuples in 'chunk_size' chunks"""
+ ...
- def iter_fetch(self, chunk_size: int) -> Generator[List[Tuple[Any, ...]], Any, Any]: ...
+ def fetchone(self) -> Optional[Tuple[Any, ...]]:
+ """fetch first item as python tuple"""
+ ...
- def fetchone(self) -> Optional[Tuple[Any, ...]]: ...
+ # modifying access parameters
+ def limit(self, limit: int) -> "SupportsReadableRelation":
+ """limit the result to 'limit' items"""
+ ...
+
+ def head(self, limit: int = 5) -> "SupportsReadableRelation":
+ """limit the result to 5 items by default"""
+ ...
+
+ def select(self, *columns: str) -> "SupportsReadableRelation":
+ """set which columns will be selected"""
+ ...
+
+ @overload
+ def __getitem__(self, column: str) -> "SupportsReadableRelation": ...
+
+ @overload
+ def __getitem__(self, columns: Sequence[str]) -> "SupportsReadableRelation": ...
+
+ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation":
+ """set which columns will be selected"""
+ ...
+
+ def __copy__(self) -> "SupportsReadableRelation":
+ """create a copy of the relation object"""
+ ...
class DBApiCursor(SupportsReadableRelation):
@@ -946,4 +1000,4 @@ def from_reference(
return dest
-TDestination = Destination[DestinationClientConfiguration, JobClientBase]
+AnyDestination = Destination[DestinationClientConfiguration, JobClientBase]
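A hedged sketch of how the extended SupportsReadableRelation protocol is meant to be consumed: chain the new access modifiers, then materialize. Only the protocol methods declared above are used; the column names and the way the relation is obtained are assumptions.

from dlt.common.destination.reference import SupportsReadableRelation

def preview(relation: SupportsReadableRelation) -> None:
    print(relation.head().fetchall())                       # first 5 rows as python tuples
    print(relation.select("id", "value").limit(100).df())   # narrow and cap, then to pandas
    for arrow_chunk in relation[["id", "value"]].iter_arrow(chunk_size=10_000):
        print(arrow_chunk.num_rows)                         # stream arrow tables in chunks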
diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py
index 9caba55183..4047bc3a1a 100644
--- a/dlt/common/libs/deltalake.py
+++ b/dlt/common/libs/deltalake.py
@@ -24,7 +24,10 @@
)
-def ensure_delta_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+def ensure_delta_compatible_arrow_schema(
+ schema: pa.Schema,
+ partition_by: Optional[Union[List[str], str]] = None,
+) -> pa.Schema:
"""Returns Arrow schema compatible with Delta table format.
Casts schema to replace data types not supported by Delta.
@@ -35,12 +38,24 @@ def ensure_delta_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
pa.types.is_time: pa.string(),
pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128
}
+
+ # partition fields can't be dictionary: https://github.com/delta-io/delta-rs/issues/2969
+ if partition_by is not None:
+ if isinstance(partition_by, str):
+ partition_by = [partition_by]
+ if any(pa.types.is_dictionary(schema.field(col).type) for col in partition_by):
+            # cast all dictionary fields to string; this is a crude approach because
+ # 1. dictionary value type is disregarded
+ # 2. any non-partition dictionary fields are cast too
+ ARROW_TO_DELTA_COMPATIBLE_ARROW_TYPE_MAP[pa.types.is_dictionary] = pa.string()
+
# NOTE: also consider calling _convert_pa_schema_to_delta() from delta.schema which casts unsigned types
return cast_arrow_schema_types(schema, ARROW_TO_DELTA_COMPATIBLE_ARROW_TYPE_MAP)
def ensure_delta_compatible_arrow_data(
- data: Union[pa.Table, pa.RecordBatchReader]
+ data: Union[pa.Table, pa.RecordBatchReader],
+ partition_by: Optional[Union[List[str], str]] = None,
) -> Union[pa.Table, pa.RecordBatchReader]:
"""Returns Arrow data compatible with Delta table format.
@@ -53,7 +68,7 @@ def ensure_delta_compatible_arrow_data(
version="17.0.0",
msg="`pyarrow>=17.0.0` is needed for `delta` table format on `filesystem` destination.",
)
- schema = ensure_delta_compatible_arrow_schema(data.schema)
+ schema = ensure_delta_compatible_arrow_schema(data.schema, partition_by)
return data.cast(schema)
@@ -87,7 +102,7 @@ def write_delta_table(
# is released
write_deltalake( # type: ignore[call-overload]
table_or_uri=table_or_uri,
- data=ensure_delta_compatible_arrow_data(data),
+ data=ensure_delta_compatible_arrow_data(data, partition_by),
partition_by=partition_by,
mode=get_delta_write_mode(write_disposition),
schema_mode="merge", # enable schema evolution (adding new columns)
@@ -116,9 +131,10 @@ def merge_delta_table(
primary_keys = get_columns_names_with_prop(schema, "primary_key")
predicate = " AND ".join([f"target.{c} = source.{c}" for c in primary_keys])
+ partition_by = get_columns_names_with_prop(schema, "partition")
qry = (
table.merge(
- source=ensure_delta_compatible_arrow_data(data),
+ source=ensure_delta_compatible_arrow_data(data, partition_by),
predicate=predicate,
source_alias="source",
target_alias="target",
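A small sketch of the cast that the new `partition_by` argument enables: a dictionary-encoded partition column is downcast to string so delta-rs accepts it. Assumes pyarrow is installed (version 17, per the workflow changes above); the field names are illustrative.

import pyarrow as pa

from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema

schema = pa.schema(
    [("category", pa.dictionary(pa.int32(), pa.string())), ("value", pa.int64())]
)
compat = ensure_delta_compatible_arrow_schema(schema, partition_by=["category"])
print(compat.field("category").type)  # string, no longer dictionary-encoded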
diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py
index ad4b017336..c5338192a0 100644
--- a/dlt/common/normalizers/json/relational.py
+++ b/dlt/common/normalizers/json/relational.py
@@ -15,7 +15,6 @@
TColumnName,
TSimpleRegex,
DLT_NAME_PREFIX,
- TTableSchema,
)
from dlt.common.schema.utils import (
column_name_validator,
@@ -100,32 +99,31 @@ def _flatten(
) -> Tuple[DictStrAny, Dict[Tuple[str, ...], Sequence[Any]]]:
out_rec_row: DictStrAny = {}
out_rec_list: Dict[Tuple[str, ...], Sequence[Any]] = {}
- schema_naming = self.schema.naming
def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) -> None:
for k, v in dict_row.items():
if k.strip():
- norm_k = schema_naming.normalize_identifier(k)
+ norm_k = self._normalize_identifier(self.schema, k)
else:
# for empty keys in the data use _
norm_k = self.EMPTY_KEY_IDENTIFIER
# if norm_k != k:
# print(f"{k} -> {norm_k}")
nested_name = (
- norm_k if path == () else schema_naming.shorten_fragments(*path, norm_k)
+ norm_k if path == () else self._shorten_fragments(self.schema, *path, norm_k)
)
# for lists and dicts we must check if type is possibly nested
if isinstance(v, (dict, list)):
- if not self._is_nested_type(
- self.schema, table, nested_name, self.max_nesting, __r_lvl
- ):
+ if not self._is_nested_type(self.schema, table, nested_name, __r_lvl):
# TODO: if schema contains table {table}__{nested_name} then convert v into single element list
if isinstance(v, dict):
# flatten the dict more
- norm_row_dicts(v, __r_lvl + 1, path + (norm_k,))
+ norm_row_dicts(v, __r_lvl - 1, path + (norm_k,))
else:
# pass the list to out_rec_list
- out_rec_list[path + (schema_naming.normalize_table_identifier(k),)] = v
+ out_rec_list[
+ path + (self._normalize_table_identifier(self.schema, k),)
+ ] = v
continue
else:
# pass the nested value to out_rec_row
@@ -174,9 +172,9 @@ def _add_row_id(
flattened_row: DictStrAny,
parent_row_id: str,
pos: int,
- _r_lvl: int,
+ is_root: bool = False,
) -> str:
- if _r_lvl == 0: # root table
+ if is_root: # root table
row_id_type = self._get_root_row_id_type(self.schema, table)
if row_id_type in ("key_hash", "row_hash"):
subset = None
@@ -201,14 +199,14 @@ def _add_row_id(
flattened_row[self.c_dlt_id] = row_id
return row_id
- def _get_propagated_values(self, table: str, row: DictStrAny, _r_lvl: int) -> StrAny:
+ def _get_propagated_values(self, table: str, row: DictStrAny, is_root: bool) -> StrAny:
extend: DictStrAny = {}
config = self.propagation_config
if config:
# mapping(k:v): propagate property with name "k" as property with name "v" in nested table
mappings: Dict[TColumnName, TColumnName] = {}
- if _r_lvl == 0:
+ if is_root:
mappings.update(config.get("root") or {})
if table in (config.get("tables") or {}):
mappings.update(config["tables"][table])
@@ -229,7 +227,7 @@ def _normalize_list(
parent_row_id: Optional[str] = None,
_r_lvl: int = 0,
) -> TNormalizedRowIterator:
- table = self.schema.naming.shorten_fragments(*parent_path, *ident_path)
+ table = self._shorten_fragments(self.schema, *parent_path, *ident_path)
for idx, v in enumerate(seq):
if isinstance(v, dict):
@@ -246,14 +244,14 @@ def _normalize_list(
parent_path,
parent_row_id,
idx,
- _r_lvl + 1,
+ _r_lvl - 1,
)
else:
# found non-dict in seq, so wrap it
wrap_v = wrap_in_dict(self.c_value, v)
DataItemNormalizer._extend_row(extend, wrap_v)
- self._add_row_id(table, wrap_v, wrap_v, parent_row_id, idx, _r_lvl)
- yield (table, self.schema.naming.shorten_fragments(*parent_path)), wrap_v
+ self._add_row_id(table, wrap_v, wrap_v, parent_row_id, idx)
+ yield (table, self._shorten_fragments(self.schema, *parent_path)), wrap_v
def _normalize_row(
self,
@@ -264,9 +262,10 @@ def _normalize_row(
parent_row_id: Optional[str] = None,
pos: Optional[int] = None,
_r_lvl: int = 0,
+ is_root: bool = False,
) -> TNormalizedRowIterator:
schema = self.schema
- table = schema.naming.shorten_fragments(*parent_path, *ident_path)
+ table = self._shorten_fragments(schema, *parent_path, *ident_path)
# flatten current row and extract all lists to recur into
flattened_row, lists = self._flatten(table, dict_row, _r_lvl)
# always extend row
@@ -274,14 +273,14 @@ def _normalize_row(
# infer record hash or leave existing primary key if present
row_id = flattened_row.get(self.c_dlt_id, None)
if not row_id:
- row_id = self._add_row_id(table, dict_row, flattened_row, parent_row_id, pos, _r_lvl)
+ row_id = self._add_row_id(table, dict_row, flattened_row, parent_row_id, pos, is_root)
# find fields to propagate to nested tables in config
- extend.update(self._get_propagated_values(table, flattened_row, _r_lvl))
+ extend.update(self._get_propagated_values(table, flattened_row, is_root))
# yield parent table first
should_descend = yield (
- (table, schema.naming.shorten_fragments(*parent_path)),
+ (table, self._shorten_fragments(schema, *parent_path)),
flattened_row,
)
if should_descend is False:
@@ -295,7 +294,7 @@ def _normalize_row(
list_path,
parent_path + ident_path,
row_id,
- _r_lvl + 1,
+ _r_lvl - 1,
)
def extend_schema(self) -> None:
@@ -361,10 +360,16 @@ def normalize_data_item(
row = cast(DictStrAny, item)
# identify load id if loaded data must be processed after loading incrementally
row[self.c_dlt_load_id] = load_id
+ # get table name and nesting level
+ root_table_name = self._normalize_table_identifier(self.schema, table_name)
+ max_nesting = self._get_table_nesting_level(self.schema, root_table_name, self.max_nesting)
+
yield from self._normalize_row(
row,
{},
- (self.schema.naming.normalize_table_identifier(table_name),),
+ (root_table_name,),
+ _r_lvl=max_nesting, # we count backwards
+ is_root=True,
)
@classmethod
@@ -422,12 +427,39 @@ def _normalize_prop(
validator_f=column_name_validator(schema.naming),
)
+ #
+ # Cached helper methods for all operations that are called often
+ #
+ @staticmethod
+ @lru_cache(maxsize=None)
+ def _shorten_fragments(schema: Schema, *idents: str) -> str:
+ return schema.naming.shorten_fragments(*idents)
+
+ @staticmethod
+ @lru_cache(maxsize=None)
+ def _normalize_table_identifier(schema: Schema, table_name: str) -> str:
+ return schema.naming.normalize_table_identifier(table_name)
+
@staticmethod
- def _get_table_nesting_level(schema: Schema, table_name: str) -> Optional[int]:
+ @lru_cache(maxsize=None)
+ def _normalize_identifier(schema: Schema, identifier: str) -> str:
+ return schema.naming.normalize_path(identifier)
+
+ @staticmethod
+ @lru_cache(maxsize=None)
+ def _get_table_nesting_level(
+ schema: Schema, table_name: str, default_nesting: int = 1000
+ ) -> Optional[int]:
+ """gets table nesting level, will inherit from parent if not set"""
+
table = schema.tables.get(table_name)
- if table:
- return table.get("x-normalizer", {}).get("max_nesting") # type: ignore
- return None
+ if (
+ table
+ and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting")))
+ is not None
+ ):
+ return max_nesting
+ return default_nesting
@staticmethod
@lru_cache(maxsize=None)
@@ -440,18 +472,18 @@ def _get_primary_key(schema: Schema, table_name: str) -> List[str]:
@staticmethod
@lru_cache(maxsize=None)
def _is_nested_type(
- schema: Schema, table_name: str, field_name: str, max_nesting: int, _r_lvl: int
+ schema: Schema,
+ table_name: str,
+ field_name: str,
+ _r_lvl: int,
) -> bool:
"""For those paths the nested objects should be left in place.
Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster
"""
- # turn everything at the recursion level into nested type
- max_table_nesting = DataItemNormalizer._get_table_nesting_level(schema, table_name)
- if max_table_nesting is not None:
- max_nesting = max_table_nesting
- assert _r_lvl <= max_nesting
- if _r_lvl == max_nesting:
+ # nesting level is counted backwards
+        # if we have traversed to or beyond the calculated nesting level, we detect a nested type
+ if _r_lvl <= 0:
return True
column: TColumnSchema = None
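For orientation, a hedged sketch of what the backwards-counted nesting level controls from the user side: with `max_table_nesting=1` a single level of dicts is flattened and anything deeper is kept as one json-typed column instead of spawning child tables. Resource and field names are illustrative.

import dlt

@dlt.resource(max_table_nesting=1)  # stored on the root table as the x-normalizer max_nesting hint
def events():
    yield {"id": 1, "payload": {"kind": "click", "details": {"x": 10, "y": 20}}}

pipeline = dlt.pipeline(pipeline_name="nesting_demo", destination="duckdb")
pipeline.run(events())
# expected: columns id and payload__kind, while payload__details stays a single json value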
diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py
index bc7584b39e..dba1036f85 100644
--- a/dlt/common/pipeline.py
+++ b/dlt/common/pipeline.py
@@ -31,7 +31,7 @@
from dlt.common.configuration.specs import ContainerInjectableContext
from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
from dlt.common.configuration.specs import RuntimeConfiguration
-from dlt.common.destination import TDestinationReferenceArg, TDestination
+from dlt.common.destination import TDestinationReferenceArg, AnyDestination
from dlt.common.destination.exceptions import DestinationHasFailedJobs
from dlt.common.exceptions import (
PipelineStateNotAvailable,
@@ -446,6 +446,8 @@ class TPipelineLocalState(TypedDict, total=False):
"""Timestamp indicating when the state was synced with the destination."""
_last_extracted_hash: str
"""Hash of state that was recently synced with destination"""
+ initial_cwd: str
+    """Current working dir when the pipeline was instantiated for the first time"""
class TPipelineState(TVersionedState, total=False):
@@ -480,7 +482,7 @@ class SupportsPipeline(Protocol):
"""Name of the pipeline"""
default_schema_name: str
"""Name of the default schema"""
- destination: TDestination
+ destination: AnyDestination
"""The destination reference which is ModuleType. `destination.__name__` returns the name string"""
dataset_name: str
"""Name of the dataset to which pipeline will be loaded to"""
diff --git a/dlt/common/runtime/exec_info.py b/dlt/common/runtime/exec_info.py
index 3aa19c83ab..b894b3aad8 100644
--- a/dlt/common/runtime/exec_info.py
+++ b/dlt/common/runtime/exec_info.py
@@ -52,6 +52,8 @@ def exec_info_names() -> List[TExecInfoNames]:
names.append("aws_lambda")
if is_gcp_cloud_function():
names.append("gcp_cloud_function")
+ if is_streamlit():
+ names.append("streamlit")
return names
@@ -63,6 +65,10 @@ def is_github_actions() -> bool:
return "GITHUB_ACTIONS" in os.environ
+def is_streamlit() -> bool:
+ return "STREAMLIT_SERVER_PORT" in os.environ
+
+
def is_notebook() -> bool:
try:
return bool(str(get_ipython())) # type: ignore
diff --git a/dlt/common/runtime/run_context.py b/dlt/common/runtime/run_context.py
index 6eb8ca5f67..252b1084ae 100644
--- a/dlt/common/runtime/run_context.py
+++ b/dlt/common/runtime/run_context.py
@@ -62,7 +62,7 @@ def data_dir(self) -> str:
return os.path.join("/var", "dlt")
home = os.path.expanduser("~")
- if home is None:
+ if home is None or not is_folder_writable(home):
# no home dir - use temp
return os.path.join(tempfile.gettempdir(), "dlt")
else:
@@ -118,6 +118,21 @@ def plug_run_context_impl(
return RunContext(run_dir)
+def is_folder_writable(path: str) -> bool:
+ import tempfile
+
+ try:
+ # Ensure the path exists
+ if not os.path.exists(path):
+ return False
+ # Attempt to create a temporary file
+ with tempfile.TemporaryFile(dir=path):
+ pass
+ return True
+ except OSError:
+ return False
+
+
def current() -> SupportsRunContext:
"""Returns currently active run context"""
return Container()[PluggableRunContext].context
diff --git a/dlt/common/runtime/signals.py b/dlt/common/runtime/signals.py
index a8fa70936e..7212e32530 100644
--- a/dlt/common/runtime/signals.py
+++ b/dlt/common/runtime/signals.py
@@ -42,11 +42,17 @@ def sleep(sleep_seconds: float) -> None:
# do not allow sleeping if signal was received
raise_if_signalled()
# sleep or wait for signal
+ exit_event.clear()
exit_event.wait(sleep_seconds)
# if signal then raise
raise_if_signalled()
+def wake_all() -> None:
+ """Wakes all threads sleeping on event"""
+ exit_event.set()
+
+
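A tiny sketch of the new wake-up path: a thread blocked in `signals.sleep()` returns as soon as `wake_all()` sets the shared exit event, instead of waiting out the full timeout.

import threading
import time

from dlt.common.runtime import signals

def waiter() -> None:
    signals.sleep(30)  # would block for 30 seconds without the wake-up
    print("woken up early")

t = threading.Thread(target=waiter)
t.start()
time.sleep(0.1)
signals.wake_all()  # sets exit_event and releases the waiting thread
t.join()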
@contextmanager
def delayed_signals() -> Iterator[None]:
"""Will delay signalling until `raise_if_signalled` is used or signalled `sleep`"""
diff --git a/dlt/common/runtime/typing.py b/dlt/common/runtime/typing.py
index eb167e9002..9a4d0f3d48 100644
--- a/dlt/common/runtime/typing.py
+++ b/dlt/common/runtime/typing.py
@@ -15,6 +15,7 @@
"colab",
"aws_lambda",
"gcp_cloud_function",
+ "streamlit",
]
diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py
index 1ecc491174..b91ef5f07e 100644
--- a/dlt/common/storages/live_schema_storage.py
+++ b/dlt/common/storages/live_schema_storage.py
@@ -12,12 +12,20 @@ def __init__(self, config: SchemaStorageConfiguration, makedirs: bool = False) -
super().__init__(config, makedirs)
def __getitem__(self, name: str) -> Schema:
+ schema: Schema = None
if name in self.live_schemas:
schema = self.live_schemas[name]
if not self.is_live_schema_committed(name):
return schema
# return new schema instance
- schema = self.load_schema(name)
+ try:
+ schema = self.load_schema(name)
+ except SchemaNotFoundError:
+ # a committed live schema found that is not yet written to storage
+ # may happen when schema is passed explicitly via schema arg to run / pipeline
+ if schema:
+ return schema
+ raise
schema = self.set_live_schema(schema)
return schema
diff --git a/dlt/common/time.py b/dlt/common/time.py
index 26de0b5645..4ce411baa4 100644
--- a/dlt/common/time.py
+++ b/dlt/common/time.py
@@ -1,5 +1,6 @@
import contextlib
import datetime # noqa: I251
+import re
from typing import Any, Optional, Union, overload, TypeVar, Callable # noqa
from pendulum.parsing import (
@@ -154,6 +155,53 @@ def ensure_pendulum_time(value: Union[str, datetime.time]) -> pendulum.Time:
raise TypeError(f"Cannot coerce {value} to a pendulum.Time object.")
+def detect_datetime_format(value: str) -> Optional[str]:
+ format_patterns = {
+ # Full datetime with 'Z' (UTC) or timezone offset
+ re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$"): "%Y-%m-%dT%H:%M:%SZ", # UTC 'Z'
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z$"
+ ): "%Y-%m-%dT%H:%M:%S.%fZ", # UTC with fractional seconds
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}$"
+ ): "%Y-%m-%dT%H:%M:%S%z", # Timezone offset
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}$"
+ ): "%Y-%m-%dT%H:%M:%S%z", # Timezone without colon
+ # Full datetime with fractional seconds and timezone
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{2}:\d{2}$"
+ ): "%Y-%m-%dT%H:%M:%S.%f%z",
+ re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{4}$"
+ ): "%Y-%m-%dT%H:%M:%S.%f%z", # Timezone without colon
+ # Datetime without timezone
+ re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M:%S", # No timezone
+ re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}$"): "%Y-%m-%dT%H:%M", # Minute precision
+ re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}$"): "%Y-%m-%dT%H", # Hour precision
+ # Date-only formats
+ re.compile(r"^\d{4}-\d{2}-\d{2}$"): "%Y-%m-%d", # Date only
+ re.compile(r"^\d{4}-\d{2}$"): "%Y-%m", # Year and month
+ re.compile(r"^\d{4}$"): "%Y", # Year only
+ # Week-based date formats
+ re.compile(r"^\d{4}-W\d{2}$"): "%Y-W%W", # Week-based date
+ re.compile(r"^\d{4}-W\d{2}-\d{1}$"): "%Y-W%W-%u", # Week-based date with day
+ # Ordinal date formats (day of year)
+ re.compile(r"^\d{4}-\d{3}$"): "%Y-%j", # Ordinal date
+ # Compact formats (no dashes)
+ re.compile(r"^\d{8}$"): "%Y%m%d", # Compact date format
+ re.compile(r"^\d{6}$"): "%Y%m", # Compact year and month format
+ }
+
+ # Match against each compiled regular expression
+ for pattern, format_str in format_patterns.items():
+ if pattern.match(value):
+ return format_str
+
+ # Return None if no pattern matches
+ return None
+
+
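A few concrete inputs for the new `detect_datetime_format` helper; each expected result follows from the regex table above.

from dlt.common.time import detect_datetime_format

print(detect_datetime_format("2024-10-01T12:30:00Z"))       # %Y-%m-%dT%H:%M:%SZ
print(detect_datetime_format("2024-10-01T12:30:00+02:00"))  # %Y-%m-%dT%H:%M:%S%z
print(detect_datetime_format("2024-10-01"))                 # %Y-%m-%d
print(detect_datetime_format("20241001"))                   # %Y%m%d
print(detect_datetime_format("next tuesday"))               # None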
def to_py_datetime(value: datetime.datetime) -> datetime.datetime:
"""Convert a pendulum.DateTime to a py datetime object.
diff --git a/dlt/common/utils.py b/dlt/common/utils.py
index be8b28fc6b..170baf6f9f 100644
--- a/dlt/common/utils.py
+++ b/dlt/common/utils.py
@@ -24,6 +24,7 @@
Sequence,
Set,
Tuple,
+ Type,
TypeVar,
Mapping,
List,
@@ -41,6 +42,7 @@
T = TypeVar("T")
+TObj = TypeVar("TObj", bound=object)
TDict = TypeVar("TDict", bound=MutableMapping[Any, Any])
TKey = TypeVar("TKey")
@@ -504,6 +506,20 @@ def without_none(d: Mapping[TKey, Optional[TValue]]) -> Mapping[TKey, TValue]:
return {k: v for k, v in d.items() if v is not None}
+def exclude_keys(mapping: Mapping[str, Any], keys: Iterable[str]) -> Dict[str, Any]:
+ """Create a new dictionary from the input mapping, excluding specified keys.
+
+ Args:
+ mapping (Mapping[str, Any]): The input mapping from which keys will be excluded.
+ keys (Iterable[str]): The keys to exclude.
+
+ Returns:
+ Dict[str, Any]: A new dictionary containing all key-value pairs from the original
+ mapping except those with keys specified in `keys`.
+ """
+ return {k: v for k, v in mapping.items() if k not in keys}
+
+
def get_full_class_name(obj: Any) -> str:
cls = obj.__class__
module = cls.__module__
@@ -605,3 +621,17 @@ def assert_min_pkg_version(pkg_name: str, version: str, msg: str = "") -> None:
version_required=">=" + version,
appendix=msg,
)
+
+
+def make_defunct_class(cls: TObj) -> Type[TObj]:
+ class DefunctClass(cls.__class__): # type: ignore[name-defined]
+        """A defunct class to replace __class__ when we want to destroy the current instance"""
+
+ def __getattribute__(self, name: str) -> Any:
+ if name == "__class__":
+ # Allow access to __class__
+ return object.__getattribute__(self, name)
+ else:
+ raise RuntimeError("This instance has been dropped and cannot be used anymore.")
+
+ return DefunctClass
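A short sketch of the intended use of `make_defunct_class`: swap an instance's `__class__` so that any later attribute access (other than `__class__` itself) raises. The `_Connection` class is purely illustrative.

from dlt.common.utils import make_defunct_class

class _Connection:
    def query(self) -> str:
        return "ok"

conn = _Connection()
conn.__class__ = make_defunct_class(conn)  # derived from _Connection, so the assignment is allowed

try:
    conn.query()
except RuntimeError as ex:
    print(ex)  # This instance has been dropped and cannot be used anymore.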
diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py
index 40583c6a9c..cffdc0f059 100644
--- a/dlt/destinations/dataset.py
+++ b/dlt/destinations/dataset.py
@@ -1,5 +1,8 @@
-from typing import Any, Generator, Optional, Union
+from typing import Any, Generator, Optional, Sequence, Union, List
from dlt.common.json import json
+from copy import deepcopy
+
+from dlt.common.normalizers.naming.naming import NamingConvention
from contextlib import contextmanager
from dlt.common.destination.reference import (
@@ -16,20 +19,53 @@
from dlt.common.schema.typing import TTableSchemaColumns
from dlt.destinations.sql_client import SqlClientBase, WithSqlClient
from dlt.common.schema import Schema
+from dlt.common.exceptions import DltException
+
+
+class DatasetException(DltException):
+ pass
+
+
+class ReadableRelationHasQueryException(DatasetException):
+ def __init__(self, attempted_change: str) -> None:
+ msg = (
+ "This readable relation was created with a provided sql query. You cannot change"
+            f" {attempted_change}. Please change the original sql query."
+ )
+ super().__init__(msg)
+
+
+class ReadableRelationUnknownColumnException(DatasetException):
+ def __init__(self, column_name: str) -> None:
+ msg = (
+            f"The selected column {column_name} is not known in the dlt schema for this relation."
+ )
+ super().__init__(msg)
class ReadableDBAPIRelation(SupportsReadableRelation):
def __init__(
self,
*,
- client: SqlClientBase[Any],
- query: Any,
- schema_columns: TTableSchemaColumns = None,
+ readable_dataset: "ReadableDBAPIDataset",
+ provided_query: Any = None,
+ table_name: str = None,
+ limit: int = None,
+ selected_columns: Sequence[str] = None,
) -> None:
"""Create a lazy evaluated relation to for the dataset of a destination"""
- self.client = client
- self.schema_columns = schema_columns
- self.query = query
+
+ # NOTE: we can keep an assertion here, this class will not be created by the user
+ assert bool(table_name) != bool(
+ provided_query
+ ), "Please provide either an sql query OR a table_name"
+
+ self._dataset = readable_dataset
+
+ self._provided_query = provided_query
+ self._table_name = table_name
+ self._limit = limit
+ self._selected_columns = selected_columns
# wire protocol functions
self.df = self._wrap_func("df") # type: ignore
@@ -42,18 +78,83 @@ def __init__(
self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore
self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore
+ @property
+ def sql_client(self) -> SqlClientBase[Any]:
+ return self._dataset.sql_client
+
+ @property
+ def schema(self) -> Schema:
+ return self._dataset.schema
+
+ @property
+ def query(self) -> Any:
+ """build the query"""
+ if self._provided_query:
+ return self._provided_query
+
+ table_name = self.sql_client.make_qualified_table_name(
+ self.schema.naming.normalize_path(self._table_name)
+ )
+
+ maybe_limit_clause_1 = ""
+ maybe_limit_clause_2 = ""
+ if self._limit:
+ maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql(
+ self._limit
+ )
+
+ selector = "*"
+ if self._selected_columns:
+ selector = ",".join(
+ [
+ self.sql_client.escape_column_name(self.schema.naming.normalize_path(c))
+ for c in self._selected_columns
+ ]
+ )
+
+ return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}"
+
+ @property
+ def columns_schema(self) -> TTableSchemaColumns:
+ return self.compute_columns_schema()
+
+ @columns_schema.setter
+ def columns_schema(self, new_value: TTableSchemaColumns) -> None:
+ raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed")
+
+ def compute_columns_schema(self) -> TTableSchemaColumns:
+ """provide schema columns for the cursor, may be filtered by selected columns"""
+
+ columns_schema = (
+ self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {}
+ )
+
+ if not columns_schema:
+ return None
+ if not self._selected_columns:
+ return columns_schema
+
+ filtered_columns: TTableSchemaColumns = {}
+ for sc in self._selected_columns:
+ sc = self.schema.naming.normalize_path(sc)
+ if sc not in columns_schema.keys():
+ raise ReadableRelationUnknownColumnException(sc)
+ filtered_columns[sc] = columns_schema[sc]
+
+ return filtered_columns
+
@contextmanager
def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]:
"""Gets a DBApiCursor for the current relation"""
- with self.client as client:
+ with self.sql_client as client:
# this hacky code is needed for mssql to disable autocommit, read iterators
# will not work otherwise. in the future we should be able to create a readony
# client which will do this automatically
- if hasattr(self.client, "_conn") and hasattr(self.client._conn, "autocommit"):
- self.client._conn.autocommit = False
+ if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"):
+ self.sql_client._conn.autocommit = False
with client.execute_query(self.query) as cursor:
- if self.schema_columns:
- cursor.schema_columns = self.schema_columns
+ if columns_schema := self.columns_schema:
+ cursor.columns_schema = columns_schema
yield cursor
def _wrap_iter(self, func_name: str) -> Any:
@@ -74,6 +175,43 @@ def _wrap(*args: Any, **kwargs: Any) -> Any:
return _wrap
+ def __copy__(self) -> "ReadableDBAPIRelation":
+ return self.__class__(
+ readable_dataset=self._dataset,
+ provided_query=self._provided_query,
+ table_name=self._table_name,
+ limit=self._limit,
+ selected_columns=self._selected_columns,
+ )
+
+ def limit(self, limit: int) -> "ReadableDBAPIRelation":
+ if self._provided_query:
+ raise ReadableRelationHasQueryException("limit")
+ rel = self.__copy__()
+ rel._limit = limit
+ return rel
+
+ def select(self, *columns: str) -> "ReadableDBAPIRelation":
+ if self._provided_query:
+ raise ReadableRelationHasQueryException("select")
+ rel = self.__copy__()
+ rel._selected_columns = columns
+ # NOTE: the line below will ensure that no unknown columns are selected if
+ # schema is known
+ rel.compute_columns_schema()
+ return rel
+
+ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation":
+ if isinstance(columns, str):
+ return self.select(columns)
+ elif isinstance(columns, Sequence):
+ return self.select(*columns)
+ else:
+ raise TypeError(f"Invalid argument type: {type(columns).__name__}")
+
+ def head(self, limit: int = 5) -> "ReadableDBAPIRelation":
+ return self.limit(limit)
+
class ReadableDBAPIDataset(SupportsReadableDataset):
"""Access to dataframes and arrowtables in the destination dataset via dbapi"""
@@ -145,20 +283,14 @@ def _ensure_client_and_schema(self) -> None:
" SqlClient."
)
- def __call__(
- self, query: Any, schema_columns: TTableSchemaColumns = None
- ) -> ReadableDBAPIRelation:
- schema_columns = schema_columns or {}
- return ReadableDBAPIRelation(client=self.sql_client, query=query, schema_columns=schema_columns) # type: ignore[abstract]
+ def __call__(self, query: Any) -> ReadableDBAPIRelation:
+ return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract]
def table(self, table_name: str) -> SupportsReadableRelation:
- # prepare query for table relation
- schema_columns = (
- self.schema.tables.get(table_name, {}).get("columns", {}) if self.schema else {}
- )
- table_name = self.sql_client.make_qualified_table_name(table_name)
- query = f"SELECT * FROM {table_name}"
- return self(query, schema_columns)
+ return ReadableDBAPIRelation(
+ readable_dataset=self,
+ table_name=table_name,
+ ) # type: ignore[abstract]
def __getitem__(self, table_name: str) -> SupportsReadableRelation:
"""access of table via dict notation"""
diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py
index d01b54740e..2b3927e7c9 100644
--- a/dlt/destinations/impl/bigquery/bigquery.py
+++ b/dlt/destinations/impl/bigquery/bigquery.py
@@ -49,6 +49,7 @@
from dlt.destinations.job_impl import DestinationJsonlLoadJob, DestinationParquetLoadJob
from dlt.destinations.job_impl import ReferenceFollowupJobRequest
from dlt.destinations.sql_jobs import SqlMergeFollowupJob
+from dlt.destinations.sql_client import SqlClientBase
class BigQueryLoadJob(RunnableLoadJob, HasFollowupJobs):
@@ -124,15 +125,17 @@ def run(self) -> None:
)
def exception(self) -> str:
- return json.dumps(
- {
- "error_result": self._bq_load_job.error_result,
- "errors": self._bq_load_job.errors,
- "job_start": self._bq_load_job.started,
- "job_end": self._bq_load_job.ended,
- "job_id": self._bq_load_job.job_id,
- }
- )
+ if self._bq_load_job:
+ return json.dumps(
+ {
+ "error_result": self._bq_load_job.error_result,
+ "errors": self._bq_load_job.errors,
+ "job_start": self._bq_load_job.started,
+ "job_end": self._bq_load_job.ended,
+ "job_id": self._bq_load_job.job_id,
+ }
+ )
+ return super().exception()
@staticmethod
def get_job_id_from_file_path(file_path: str) -> str:
@@ -140,6 +143,18 @@ def get_job_id_from_file_path(file_path: str) -> str:
class BigQueryMergeJob(SqlMergeFollowupJob):
+ @classmethod
+ def _gen_table_setup_clauses(
+ cls, table_chain: Sequence[PreparedTableSchema], sql_client: SqlClientBase[Any]
+ ) -> List[str]:
+ """generate final tables from staging table schema for autodetect tables"""
+ sql: List[str] = []
+ for table in table_chain:
+ if should_autodetect_schema(table):
+ table_name, staging_table_name = sql_client.get_qualified_table_names(table["name"])
+ sql.append(f"CREATE TABLE IF NOT EXISTS {table_name} LIKE {staging_table_name};")
+ return sql
+
@classmethod
def gen_key_table_clauses(
cls,
@@ -183,6 +198,19 @@ def _create_merge_followup_jobs(
) -> List[FollowupJobRequest]:
return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)]
+ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
+ truncate_tables = truncate_tables or []
+
+ # split array into tables that have autodetect schema and those that don't
+ autodetect_tables = [
+ t for t in truncate_tables if should_autodetect_schema(self.prepare_load_table(t))
+ ]
+ non_autodetect_tables = [t for t in truncate_tables if t not in autodetect_tables]
+
+ # if any table has schema autodetect, we need to make sure to only truncate tables that exist
+ super().initialize_storage(truncate_tables=non_autodetect_tables)
+ self.sql_client.truncate_tables_if_exist(*autodetect_tables)
+
def create_load_job(
self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False
) -> LoadJob:
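
A small sketch of the table split performed by initialize_storage above, with should_autodetect as a stand-in predicate rather than dlt's should_autodetect_schema helper.

from typing import Callable, Iterable, List, Tuple

def split_truncate_tables(
    truncate_tables: Iterable[str], should_autodetect: Callable[[str], bool]
) -> Tuple[List[str], List[str]]:
    tables = list(truncate_tables or [])
    # tables with schema auto-detection go through the "truncate if exists" script,
    # the rest are truncated by the regular storage initialization
    autodetect_tables = [t for t in tables if should_autodetect(t)]
    non_autodetect_tables = [t for t in tables if t not in autodetect_tables]
    return autodetect_tables, non_autodetect_tables

print(split_truncate_tables(["events", "raw_payloads"], lambda t: t == "raw_payloads"))
# (['raw_payloads'], ['events'])
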
diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py
index 650db1d8b9..6911fa5c1c 100644
--- a/dlt/destinations/impl/bigquery/sql_client.py
+++ b/dlt/destinations/impl/bigquery/sql_client.py
@@ -273,6 +273,20 @@ def _make_database_exception(cls, ex: Exception) -> Exception:
# anything else is transient
return DatabaseTransientException(ex)
+ def truncate_tables_if_exist(self, *tables: str) -> None:
+ """NOTE: We only truncate tables that exist, for auto-detect schema we don't know which tables exist"""
+ statements: List[str] = ["DECLARE table_exists BOOL;"]
+ for t in tables:
+ table_name = self.make_qualified_table_name(t)
+ statements.append(
+ "SET table_exists = (SELECT COUNT(*) > 0 FROM"
+ f" `{self.project_id}.{self.dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE"
+ f" table_name = '{t}');"
+ )
+ truncate_stmt = self._truncate_table_sql(table_name).replace(";", "")
+ statements.append(f"IF table_exists THEN EXECUTE IMMEDIATE '{truncate_stmt}'; END IF;")
+ self.execute_many(statements)
+
@staticmethod
def _get_reason_from_errors(gace: api_core_exceptions.GoogleAPICallError) -> Optional[str]:
errors: List[StrAny] = getattr(gace, "errors", None)
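
To illustrate what truncate_tables_if_exist sends to BigQuery, a sketch that assembles the same kind of script client-side; project, dataset and table names are made up and no connection is involved.

from typing import List

def build_truncate_if_exists_script(project: str, dataset: str, tables: List[str]) -> List[str]:
    statements: List[str] = ["DECLARE table_exists BOOL;"]
    for t in tables:
        statements.append(
            "SET table_exists = (SELECT COUNT(*) > 0 FROM"
            f" `{project}.{dataset}.INFORMATION_SCHEMA.TABLES` WHERE table_name = '{t}');"
        )
        truncate_stmt = f"TRUNCATE TABLE `{project}.{dataset}.{t}`"
        statements.append(f"IF table_exists THEN EXECUTE IMMEDIATE '{truncate_stmt}'; END IF;")
    return statements

for stmt in build_truncate_if_exists_script("my-project", "my_dataset", ["events", "users"]):
    print(stmt)
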
diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py
index 603215d889..dba0a8667d 100644
--- a/dlt/destinations/impl/clickhouse/clickhouse.py
+++ b/dlt/destinations/impl/clickhouse/clickhouse.py
@@ -1,5 +1,4 @@
import os
-import re
from copy import deepcopy
from textwrap import dedent
from typing import Optional, List, Sequence, cast
@@ -8,7 +7,6 @@
import clickhouse_connect
from clickhouse_connect.driver.tools import insert_file
-from dlt import config
from dlt.common.configuration.specs import (
CredentialsConfiguration,
AzureCredentialsWithoutDefaults,
@@ -31,6 +29,7 @@
)
from dlt.common.schema.utils import is_nullable_column
from dlt.common.storages import FileStorage
+from dlt.common.storages.configuration import FilesystemConfiguration
from dlt.destinations.exceptions import LoadJobTerminalException
from dlt.destinations.impl.clickhouse.configuration import (
ClickHouseClientConfiguration,
@@ -55,6 +54,7 @@
)
from dlt.destinations.job_impl import ReferenceFollowupJobRequest, FinalizedLoadJobWithFollowupJobs
from dlt.destinations.sql_jobs import SqlMergeFollowupJob
+from dlt.destinations.utils import is_compression_disabled
class ClickHouseLoadJob(RunnableLoadJob, HasFollowupJobs):
@@ -88,31 +88,17 @@ def run(self) -> None:
compression = "auto"
# Don't use the DBAPI driver for local files.
- if not bucket_path:
+ if not bucket_path or bucket_scheme == "file":
+ file_path = (
+ self._file_path
+ if not bucket_path
+ else FilesystemConfiguration.make_local_path(bucket_path)
+ )
# Local filesystem.
if ext == "jsonl":
- compression = "gz" if FileStorage.is_gzipped(self._file_path) else "none"
+ compression = "gz" if FileStorage.is_gzipped(file_path) else "none"
try:
- with clickhouse_connect.create_client(
- host=client.credentials.host,
- port=client.credentials.http_port,
- database=client.credentials.database,
- user_name=client.credentials.username,
- password=client.credentials.password,
- secure=bool(client.credentials.secure),
- ) as clickhouse_connect_client:
- insert_file(
- clickhouse_connect_client,
- qualified_table_name,
- self._file_path,
- fmt=clickhouse_format,
- settings={
- "allow_experimental_lightweight_delete": 1,
- "enable_http_compression": 1,
- "date_time_input_format": "best_effort",
- },
- compression=compression,
- )
+ client.insert_file(file_path, self.load_table_name, clickhouse_format, compression)
except clickhouse_connect.driver.exceptions.Error as e:
raise LoadJobTerminalException(
self._file_path,
@@ -124,7 +110,7 @@ def run(self) -> None:
# NOTE: we should not really be accessing the config this way, but for
# now it is ok...
if ext == "jsonl":
- compression = "none" if config.get("data_writer.disable_compression") else "gz"
+ compression = "none" if is_compression_disabled() else "gz"
if bucket_scheme in ("s3", "gs", "gcs"):
if not isinstance(self._staging_credentials, AwsCredentialsWithoutDefaults):
@@ -216,6 +202,7 @@ def __init__(
self.sql_client: ClickHouseSqlClient = ClickHouseSqlClient(
config.normalize_dataset_name(schema),
config.normalize_staging_dataset_name(schema),
+ list(schema.tables.keys()),
config.credentials,
capabilities,
config,
diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py
index 7acfc08885..adcc2a3e4c 100644
--- a/dlt/destinations/impl/clickhouse/configuration.py
+++ b/dlt/destinations/impl/clickhouse/configuration.py
@@ -1,8 +1,10 @@
import dataclasses
from typing import ClassVar, Dict, List, Any, Final, cast, Optional
+from typing_extensions import Annotated
from dlt.common.configuration import configspec
from dlt.common.configuration.specs import ConnectionStringCredentials
+from dlt.common.configuration.specs.base_configuration import NotResolved
from dlt.common.destination.reference import (
DestinationClientDwhWithStagingConfiguration,
)
@@ -69,6 +71,10 @@ class ClickHouseClientConfiguration(DestinationClientDwhWithStagingConfiguration
destination_type: Final[str] = dataclasses.field( # type: ignore[misc]
default="clickhouse", init=False, repr=False, compare=False
)
+ # allow empty dataset names
+ dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field(
+ default=None, init=False, repr=False, compare=False
+ )
credentials: ClickHouseCredentials = None
dataset_table_separator: str = "___"
diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py
index 25914e4093..00f35da082 100644
--- a/dlt/destinations/impl/clickhouse/sql_client.py
+++ b/dlt/destinations/impl/clickhouse/sql_client.py
@@ -1,5 +1,14 @@
import datetime # noqa: I251
+
from clickhouse_driver import dbapi as clickhouse_dbapi # type: ignore[import-untyped]
+import clickhouse_driver
+import clickhouse_driver.errors # type: ignore[import-untyped]
+from clickhouse_driver.dbapi import OperationalError # type: ignore[import-untyped]
+from clickhouse_driver.dbapi.extras import DictCursor # type: ignore[import-untyped]
+import clickhouse_connect
+from clickhouse_connect.driver.tools import insert_file as clk_insert_file
+from clickhouse_connect.driver.summary import QuerySummary
+
from contextlib import contextmanager
from typing import (
Iterator,
@@ -14,14 +23,12 @@
cast,
)
-import clickhouse_driver
-import clickhouse_driver.errors # type: ignore[import-untyped]
-from clickhouse_driver.dbapi import OperationalError # type: ignore[import-untyped]
-from clickhouse_driver.dbapi.extras import DictCursor # type: ignore[import-untyped]
from pendulum import DateTime # noqa: I251
+from dlt.common import logger
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.typing import DictStrAny
+
from dlt.destinations.exceptions import (
DatabaseUndefinedRelation,
DatabaseTransientException,
@@ -62,14 +69,16 @@ class ClickHouseSqlClient(
def __init__(
self,
- dataset_name: str,
+ dataset_name: Optional[str],
staging_dataset_name: str,
+ known_table_names: List[str],
credentials: ClickHouseCredentials,
capabilities: DestinationCapabilitiesContext,
config: ClickHouseClientConfiguration,
) -> None:
super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities)
self._conn: clickhouse_driver.dbapi.connection = None
+ self.known_table_names = known_table_names
self.credentials = credentials
self.database_name = credentials.database
self.config = config
@@ -77,9 +86,14 @@ def __init__(
def has_dataset(self) -> bool:
# we do not need to normalize dataset_sentinel_table_name.
sentinel_table = self.config.dataset_sentinel_table_name
- return sentinel_table in [
- t.split(self.config.dataset_table_separator)[1] for t in self._list_tables()
- ]
+ all_ds_tables = self._list_tables()
+ if self.dataset_name:
+ return sentinel_table in [
+ t.split(self.config.dataset_table_separator)[1] for t in all_ds_tables
+ ]
+ else:
+ # if no dataset specified we look for sentinel table
+ return sentinel_table in all_ds_tables
def open_connection(self) -> clickhouse_driver.dbapi.connection.Connection:
self._conn = clickhouse_driver.connect(dsn=self.credentials.to_native_representation())
@@ -131,20 +145,42 @@ def drop_dataset(self) -> None:
sentinel_table_name = self.make_qualified_table_name(
self.config.dataset_sentinel_table_name
)
- # drop a sentinel table
- self.execute_sql(f"DROP TABLE {sentinel_table_name} SYNC")
-
- # Since ClickHouse doesn't have schemas, we need to drop all tables in our virtual schema,
- # or collection of tables, that has the `dataset_name` as a prefix.
- to_drop_results = [
- f"{self.catalog_name()}.{self.capabilities.escape_identifier(table)}"
- for table in self._list_tables()
- ]
+
+ all_ds_tables = self._list_tables()
+
+ if self.dataset_name:
+ # Since ClickHouse doesn't have schemas, we need to drop all tables in our virtual schema,
+ # or collection of tables, that has the `dataset_name` as a prefix.
+ to_drop_results = all_ds_tables
+ else:
+ # drop only tables known in logical (dlt) schema
+ to_drop_results = [
+ table_name for table_name in self.known_table_names if table_name in all_ds_tables
+ ]
+
+ catalog_name = self.catalog_name()
+ # drop a sentinel table only when dataset name was empty (was not included in the schema)
+ if not self.dataset_name:
+ self.execute_sql(f"DROP TABLE {sentinel_table_name} SYNC")
+ logger.warning(
+ "Dataset without name (tables without prefix) got dropped. Only tables known in the"
+ " current dlt schema and sentinel tables were removed."
+ )
+ else:
+ sentinel_table_name = self.make_qualified_table_name_path(
+ self.config.dataset_sentinel_table_name, escape=False
+ )[-1]
+ if sentinel_table_name not in all_ds_tables:
+ # no sentinel table, dataset does not exist
+ self.execute_sql(f"SELECT 1 FROM {sentinel_table_name}")
+ raise AssertionError(f"{sentinel_table_name} must not exist")
for table in to_drop_results:
# The "DROP TABLE" clause is discarded if we allow clickhouse_driver to handle parameter substitution.
# This is because the driver incorrectly substitutes the entire query string, causing the "DROP TABLE" keyword to be omitted.
# To resolve this, we are forced to provide the full query string here.
- self.execute_sql(f"DROP TABLE {table} SYNC")
+ self.execute_sql(
+ f"DROP TABLE {catalog_name}.{self.capabilities.escape_identifier(table)} SYNC"
+ )
def drop_tables(self, *tables: str) -> None:
"""Drops a set of tables if they exist"""
@@ -156,6 +192,30 @@ def drop_tables(self, *tables: str) -> None:
]
self.execute_many(statements)
+ def insert_file(
+ self, file_path: str, table_name: str, file_format: str, compression: str
+ ) -> QuerySummary:
+ with clickhouse_connect.create_client(
+ host=self.credentials.host,
+ port=self.credentials.http_port,
+ database=self.credentials.database,
+ user_name=self.credentials.username,
+ password=self.credentials.password,
+ secure=bool(self.credentials.secure),
+ ) as clickhouse_connect_client:
+ return clk_insert_file(
+ clickhouse_connect_client,
+ self.make_qualified_table_name(table_name),
+ file_path,
+ fmt=file_format,
+ settings={
+ "allow_experimental_lightweight_delete": 1,
+ "enable_http_compression": 1,
+ "date_time_input_format": "best_effort",
+ },
+ compression=compression,
+ )
+
def _list_tables(self) -> List[str]:
catalog_name, table_name = self.make_qualified_table_name_path("%", escape=False)
rows = self.execute_sql(
@@ -217,9 +277,13 @@ def make_qualified_table_name_path(
path = super().make_qualified_table_name_path(None, escape=escape)
if table_name:
# table name combines dataset name and table name
- table_name = self.capabilities.casefold_identifier(
- f"{self.dataset_name}{self.config.dataset_table_separator}{table_name}"
- )
+ if self.dataset_name:
+ table_name = self.capabilities.casefold_identifier(
+ f"{self.dataset_name}{self.config.dataset_table_separator}{table_name}"
+ )
+ else:
+ # without dataset just use the table name
+ table_name = self.capabilities.casefold_identifier(table_name)
if escape:
table_name = self.capabilities.escape_identifier(table_name)
# we have only two path components
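
A minimal sketch of the naming scheme used above: ClickHouse has no schemas, so dataset membership is emulated by prefixing table names with the dataset name, and an empty dataset name leaves the table name untouched. The separator mirrors the configuration default; names are examples.

from typing import Optional

DATASET_TABLE_SEPARATOR = "___"

def clickhouse_table_name(dataset_name: Optional[str], table_name: str) -> str:
    # with a dataset: "<dataset>___<table>"; without a dataset: just the table name
    if dataset_name:
        return f"{dataset_name}{DATASET_TABLE_SEPARATOR}{table_name}"
    return table_name

assert clickhouse_table_name("my_dataset", "events") == "my_dataset___events"
assert clickhouse_table_name(None, "events") == "events"
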
diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py
index 789dbedae9..c95b6eba4c 100644
--- a/dlt/destinations/impl/databricks/configuration.py
+++ b/dlt/destinations/impl/databricks/configuration.py
@@ -2,11 +2,13 @@
from typing import ClassVar, Final, Optional, Any, Dict, List
from dlt.common.typing import TSecretStrValue
-from dlt.common.configuration.exceptions import ConfigurationValueError
from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec
from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
+DATABRICKS_APPLICATION_ID = "dltHub_dlt"
+
+
@configspec
class DatabricksCredentials(CredentialsConfiguration):
catalog: str = None
@@ -19,6 +21,7 @@ class DatabricksCredentials(CredentialsConfiguration):
connection_parameters: Optional[Dict[str, Any]] = None
"""Additional keyword arguments that are passed to `databricks.sql.connect`"""
socket_timeout: Optional[int] = 180
+ user_agent_entry: Optional[str] = DATABRICKS_APPLICATION_ID
__config_gen_annotations__: ClassVar[List[str]] = [
"server_hostname",
@@ -28,7 +31,7 @@ class DatabricksCredentials(CredentialsConfiguration):
]
def to_connector_params(self) -> Dict[str, Any]:
- return dict(
+ conn_params = dict(
catalog=self.catalog,
server_hostname=self.server_hostname,
http_path=self.http_path,
@@ -38,6 +41,13 @@ def to_connector_params(self) -> Dict[str, Any]:
**(self.connection_parameters or {}),
)
+ if self.user_agent_entry:
+ conn_params["_user_agent_entry"] = (
+ conn_params.get("_user_agent_entry") or self.user_agent_entry
+ )
+
+ return conn_params
+
@configspec
class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration):
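
A small sketch of the precedence applied when folding the user agent into the connector parameters above: a `_user_agent_entry` already supplied via connection_parameters wins over the configured default.

from typing import Any, Dict, Optional

DATABRICKS_APPLICATION_ID = "dltHub_dlt"

def merge_user_agent(conn_params: Dict[str, Any], user_agent_entry: Optional[str]) -> Dict[str, Any]:
    if user_agent_entry:
        # keep an explicit entry from connection_parameters, otherwise fall back to the default
        conn_params["_user_agent_entry"] = conn_params.get("_user_agent_entry") or user_agent_entry
    return conn_params

assert merge_user_agent({}, DATABRICKS_APPLICATION_ID)["_user_agent_entry"] == "dltHub_dlt"
assert merge_user_agent({"_user_agent_entry": "custom"}, DATABRICKS_APPLICATION_ID)["_user_agent_entry"] == "custom"
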
diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py
index fbf552d3b1..718427af87 100644
--- a/dlt/destinations/impl/databricks/databricks.py
+++ b/dlt/destinations/impl/databricks/databricks.py
@@ -1,7 +1,6 @@
from typing import Optional, Sequence, List, cast
from urllib.parse import urlparse, urlunparse
-from dlt import config
from dlt.common.configuration.specs.azure_credentials import (
AzureServicePrincipalCredentialsWithoutDefaults,
)
@@ -31,6 +30,7 @@
from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient
from dlt.destinations.sql_jobs import SqlMergeFollowupJob
from dlt.destinations.job_impl import ReferenceFollowupJobRequest
+from dlt.destinations.utils import is_compression_disabled
AZURE_BLOB_STORAGE_PROTOCOLS = ["az", "abfss", "abfs"]
SUPPORTED_BLOB_STORAGE_PROTOCOLS = AZURE_BLOB_STORAGE_PROTOCOLS + ["s3", "gs", "gcs"]
@@ -140,7 +140,7 @@ def run(self) -> None:
if file_name.endswith(".parquet"):
source_format = "PARQUET" # Only parquet is supported
elif file_name.endswith(".jsonl"):
- if not config.get("data_writer.disable_compression"):
+ if not is_compression_disabled():
raise LoadJobTerminalException(
self._file_path,
"Databricks loader does not support gzip compressed JSON files. Please disable"
@@ -224,7 +224,7 @@ def __init__(
)
super().__init__(schema, config, sql_client)
self.config: DatabricksClientConfiguration = config
- self.sql_client: DatabricksSqlClient = sql_client # type: ignore[assignment]
+ self.sql_client: DatabricksSqlClient = sql_client
self.type_mapper = self.capabilities.get_type_mapper()
def create_load_job(
diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py
index 88d47410d5..8bff4e0d73 100644
--- a/dlt/destinations/impl/databricks/sql_client.py
+++ b/dlt/destinations/impl/databricks/sql_client.py
@@ -41,7 +41,7 @@
class DatabricksCursorImpl(DBApiCursorImpl):
"""Use native data frame support if available"""
- native_cursor: DatabricksSqlCursor # type: ignore[assignment]
+ native_cursor: DatabricksSqlCursor
vector_size: ClassVar[int] = 2048 # vector size is 2048
def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]:
@@ -144,7 +144,7 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
# db_args = kwargs or None
db_args = args or kwargs or None
- with self._conn.cursor() as curr: # type: ignore[assignment]
+ with self._conn.cursor() as curr:
curr.execute(query, db_args)
yield DatabricksCursorImpl(curr) # type: ignore[abstract]
diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py
index 0f35770747..692d0eb8e3 100644
--- a/dlt/destinations/impl/duckdb/configuration.py
+++ b/dlt/destinations/impl/duckdb/configuration.py
@@ -1,15 +1,18 @@
import os
import dataclasses
import threading
-
from typing import Any, ClassVar, Dict, Final, List, Optional, Tuple, Type, Union
-
from pathvalidate import is_valid_filepath
+
from dlt.common import logger
+from dlt.common.typing import Annotated
from dlt.common.configuration import configspec
from dlt.common.configuration.specs import ConnectionStringCredentials
+from dlt.common.configuration.specs.base_configuration import NotResolved
from dlt.common.configuration.specs.exceptions import InvalidConnectionString
from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
+from dlt.common.pipeline import SupportsPipeline
+
from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials
try:
@@ -19,7 +22,7 @@
DUCK_DB_NAME = "%s.duckdb"
DEFAULT_DUCK_DB_NAME = DUCK_DB_NAME % "quack"
-LOCAL_STATE_KEY = "duckdb_database"
+LEGACY_DB_PATH_LOCAL_STATE_KEY = "duckdb_database"
@configspec(init=False)
@@ -91,7 +94,7 @@ def _get_conn_config(self) -> Dict[str, Any]:
return {}
def _conn_str(self) -> str:
- return self.database
+ raise NotImplementedError()
def _delete_conn(self) -> None:
self._conn.close()
@@ -106,6 +109,7 @@ def __del__(self) -> None:
class DuckDbCredentials(DuckDbBaseCredentials):
drivername: Final[str] = dataclasses.field(default="duckdb", init=False, repr=False, compare=False) # type: ignore
username: Optional[str] = None
+ bound_to_pipeline: Annotated[Optional[SupportsPipeline], NotResolved()] = None
__config_gen_annotations__: ClassVar[List[str]] = []
@@ -120,49 +124,24 @@ def on_resolved(self) -> None:
if isinstance(self.database, str) and self.database == ":memory:":
raise InvalidInMemoryDuckdbCredentials()
+ def setup_database(self) -> None:
# do not set any paths for external database
if self.database == ":external:":
return
# try the pipeline context
- is_default_path = False
if self.database == ":pipeline:":
self.database = self._path_in_pipeline(DEFAULT_DUCK_DB_NAME)
else:
- # maybe get database
- maybe_database, maybe_is_default_path = self._path_from_pipeline(DEFAULT_DUCK_DB_NAME)
- # if pipeline context was not present or database was not set
- if not self.database or not maybe_is_default_path:
- # create database locally
- is_default_path = maybe_is_default_path
- self.database = maybe_database
-
- # always make database an abs path
- self.database = os.path.abspath(self.database)
- # do not save the default path into pipeline's local state
- if not is_default_path:
- self._path_to_pipeline(self.database)
+ self.database = self._path_from_pipeline(self.database, DEFAULT_DUCK_DB_NAME)
def _path_in_pipeline(self, rel_path: str) -> str:
- from dlt.common.configuration.container import Container
- from dlt.common.pipeline import PipelineContext
-
- context = Container()[PipelineContext]
- if context.is_active():
- # pipeline is active, get the working directory
- return os.path.join(context.pipeline().working_dir, rel_path)
+ if self.bound_to_pipeline:
+ return os.path.join(self.bound_to_pipeline.working_dir, rel_path)
raise RuntimeError(
"Attempting to use special duckdb database :pipeline: outside of pipeline context."
)
- def _path_to_pipeline(self, abspath: str) -> None:
- from dlt.common.configuration.container import Container
- from dlt.common.pipeline import PipelineContext
-
- context = Container()[PipelineContext]
- if context.is_active():
- context.pipeline().set_local_state_val(LOCAL_STATE_KEY, abspath)
-
- def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]:
+ def _path_from_pipeline(self, explicit_path: str, default_path: str) -> str:
"""
Returns path to DuckDB as stored in the active pipeline's local state and a boolean flag.
@@ -176,33 +155,42 @@ def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]:
Tuple[str, bool]: The path to the DuckDB as stored in the active pipeline's local state or the default path if not available,
and a boolean flag set to True when the default path is returned.
"""
- from dlt.common.configuration.container import Container
- from dlt.common.pipeline import PipelineContext
-
- context = Container()[PipelineContext]
- if context.is_active():
- try:
+ if self.bound_to_pipeline:
+ # backward compat - paths to duckdb were stored in local state and used if explicit path was not provided
+ pipeline_path: str = None
+ if not explicit_path:
+ try:
+ pipeline_path = self.bound_to_pipeline.get_local_state_val(
+ LEGACY_DB_PATH_LOCAL_STATE_KEY
+ )
+ except KeyError:
+ # no local state: default_path will be used
+ pass
+ if not pipeline_path:
+ # get initial cwd
+ initial_cwd = self.bound_to_pipeline.get_local_state_val("initial_cwd")
# use pipeline name as default
- pipeline = context.pipeline()
- default_path = DUCK_DB_NAME % pipeline.pipeline_name
- # get pipeline path from local state
- pipeline_path = pipeline.get_local_state_val(LOCAL_STATE_KEY)
- # make sure that path exists
+ pipeline_path = explicit_path or DUCK_DB_NAME % self.bound_to_pipeline.pipeline_name
+ # if explicit_path is absolute, os.path.join below keeps it unchanged
+ pipeline_path = os.path.join(initial_cwd, pipeline_path)
+ if not self.bound_to_pipeline.first_run:
if not os.path.exists(pipeline_path):
logger.warning(
- f"Duckdb attached to pipeline {pipeline.pipeline_name} in path"
- f" {os.path.relpath(pipeline_path)} was deleted. Attaching to duckdb"
- f" database '{default_path}' in current folder."
+ f"Duckdb attached to pipeline {self.bound_to_pipeline.pipeline_name} in"
+ f" path {os.path.relpath(pipeline_path)} was could not be found but"
+ " pipeline has already ran. This may be a result of (1) recreating or"
+ " attaching pipeline without or with changed explicit path to database"
+ " that was used when creating the pipeline. (2) keeping the path to to"
+ " database in secrets and changing the current working folder so dlt"
+ " cannot see them. (3) you deleting the database."
)
- else:
- return pipeline_path, False
- except KeyError:
- # no local state: default_path will be used
- pass
+ return pipeline_path
- return default_path, True
+ return os.path.abspath(explicit_path or default_path)
def _conn_str(self) -> str:
+ if not self.database or not os.path.isabs(self.database):
+ self.setup_database()
return self.database
def __init__(self, conn_or_path: Union[str, DuckDBPyConnection] = None) -> None:
@@ -214,6 +202,7 @@ def __init__(self, conn_or_path: Union[str, DuckDBPyConnection] = None) -> None:
class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration):
destination_type: Final[str] = dataclasses.field(default="duckdb", init=False, repr=False, compare=False) # type: ignore
credentials: DuckDbCredentials = None
+ bound_to_pipeline: Annotated[Optional[SupportsPipeline], NotResolved()] = None
create_indexes: bool = (
False # should unique indexes be created, this slows loading down massively
@@ -226,10 +215,18 @@ def __init__(
create_indexes: bool = False,
destination_name: str = None,
environment: str = None,
+ bound_to_pipeline: Optional[SupportsPipeline] = None,
) -> None:
super().__init__(
credentials=credentials, # type: ignore[arg-type]
destination_name=destination_name,
environment=environment,
)
+ self.bound_to_pipeline = bound_to_pipeline
self.create_indexes = create_indexes
+
+ def on_resolved(self) -> None:
+ # pass bound pipeline to duckdb credentials
+ # TODO: find a better way to pass and bind explicit pipeline context
+ self.credentials.bound_to_pipeline = self.bound_to_pipeline
+ self.credentials.setup_database()
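
The duckdb path resolution above, reduced to a pure function for illustration: an explicit path wins, then the legacy path stored in the pipeline's local state, then "<pipeline_name>.duckdb" joined onto the pipeline's initial working directory. The arguments here are plain values, not dlt's pipeline object.

import os
from typing import Optional

DUCK_DB_NAME = "%s.duckdb"

def resolve_duckdb_path(
    explicit_path: Optional[str],
    legacy_state_path: Optional[str],
    pipeline_name: str,
    initial_cwd: str,
) -> str:
    # the legacy local-state path is only consulted when no explicit path was given
    pipeline_path = None if explicit_path else legacy_state_path
    if not pipeline_path:
        pipeline_path = explicit_path or DUCK_DB_NAME % pipeline_name
        # joining keeps absolute explicit paths unchanged
        pipeline_path = os.path.join(initial_cwd, pipeline_path)
    return pipeline_path

print(resolve_duckdb_path(None, None, "my_pipeline", "/home/user/project"))
# /home/user/project/my_pipeline.duckdb
print(resolve_duckdb_path("/data/quack.duckdb", None, "my_pipeline", "/home/user/project"))
# /data/quack.duckdb
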
diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py
index e3d261d9d6..2fec3a3054 100644
--- a/dlt/destinations/impl/duckdb/factory.py
+++ b/dlt/destinations/impl/duckdb/factory.py
@@ -6,6 +6,7 @@
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
from dlt.common.destination.typing import PreparedTableSchema
from dlt.common.exceptions import TerminalValueError
+from dlt.common.pipeline import SupportsPipeline
from dlt.common.schema.typing import TColumnSchema, TColumnType
from dlt.destinations.type_mapping import TypeMapperImpl
from dlt.destinations.impl.duckdb.configuration import DuckDbCredentials, DuckDbClientConfiguration
@@ -86,7 +87,7 @@ def to_db_datetime_type(
timezone = column.get("timezone", True)
precision = column.get("precision")
- if timezone and precision is not None:
+ if timezone and precision is not None and precision != 6:
logger.warn(
f"DuckDB does not support both timezone and precision for column '{column_name}' in"
f" table '{table_name}'. Will default to timezone. Please set timezone to False to"
@@ -166,6 +167,7 @@ def __init__(
create_indexes: bool = False,
destination_name: t.Optional[str] = None,
environment: t.Optional[str] = None,
+ bound_to_pipeline: t.Optional[SupportsPipeline] = None,
**kwargs: t.Any,
) -> None:
"""Configure the DuckDB destination to use in a pipeline.
@@ -177,6 +179,7 @@ def __init__(
a path to a database file. Use :pipeline: to create a duckdb
in the working folder of the pipeline
create_indexes: Should unique indexes be created, defaults to False
+ bound_to_pipeline: Bind the connections generated by this factory to this pipeline, to enable the :pipeline: path
**kwargs: Additional arguments passed to the destination config
"""
super().__init__(
@@ -184,5 +187,6 @@ def __init__(
create_indexes=create_indexes,
destination_name=destination_name,
environment=environment,
+ bound_to_pipeline=bound_to_pipeline,
**kwargs,
)
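
A hedged usage sketch of the new bound_to_pipeline argument: binding the factory to a pipeline lets the special ":pipeline:" path resolve into that pipeline's working directory. The two-step wiring below is illustrative only; how a pipeline and destination are actually tied together may differ.

import dlt

# create (or attach to) the pipeline first so it can be bound to the destination
pipeline = dlt.pipeline("bound_example")
dest = dlt.destinations.duckdb(":pipeline:", bound_to_pipeline=pipeline)

pipeline = dlt.pipeline("bound_example", destination=dest)
pipeline.run([{"id": 1}], table_name="items")
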
diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py
index 72563e903d..aec5a80b7d 100644
--- a/dlt/destinations/impl/dummy/dummy.py
+++ b/dlt/destinations/impl/dummy/dummy.py
@@ -12,7 +12,6 @@
Iterable,
List,
)
-import os
import time
from dlt.common.metrics import LoadJobMetrics
from dlt.common.pendulum import pendulum
diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py
index 87aa254e96..fec761ff36 100644
--- a/dlt/destinations/impl/filesystem/sql_client.py
+++ b/dlt/destinations/impl/filesystem/sql_client.py
@@ -3,8 +3,6 @@
import os
import re
-import dlt
-
import duckdb
import sqlglot
@@ -14,7 +12,6 @@
from contextlib import contextmanager
from dlt.common.destination.reference import DBApiCursor
-from dlt.common.destination.typing import PreparedTableSchema
from dlt.destinations.sql_client import raise_database_error
@@ -25,6 +22,9 @@
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentialsWithoutDefaults,
)
+from dlt.destinations.utils import is_compression_disabled
+
+from pathlib import Path
SUPPORTED_PROTOCOLS = ["gs", "gcs", "s3", "file", "memory", "az", "abfss"]
@@ -75,9 +75,32 @@ def drop_authentication(self, secret_name: str = None) -> None:
self._conn.sql(f"DROP PERSISTENT SECRET IF EXISTS {secret_name}")
def create_authentication(self, persistent: bool = False, secret_name: str = None) -> None:
+ # NOTE: the home dir default for secrets is a bad choice, the location should be more explicit
if not secret_name:
secret_name = self._create_default_secret_name()
+ if persistent and self.memory_db:
+ raise Exception("Creating persistent secrets for in memory db is not allowed.")
+
+ secrets_path = Path(
+ self._conn.sql(
+ "SELECT current_setting('secret_directory') AS secret_directory;"
+ ).fetchone()[0]
+ )
+
+ is_default_secrets_directory = (
+ len(secrets_path.parts) >= 2
+ and secrets_path.parts[-1] == "stored_secrets"
+ and secrets_path.parts[-2] == ".duckdb"
+ )
+
+ if is_default_secrets_directory and persistent:
+ logger.warn(
+ "You are persisting duckdb secrets but are storing them in the default folder"
+ f" {secrets_path}. These secrets are saved there unencrypted, we"
+ " recommend using a custom secret directory."
+ )
+
persistent_stmt = ""
if persistent:
persistent_stmt = " PERSISTENT "
@@ -90,6 +113,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non
# add secrets required for creating views
if self.fs_client.config.protocol == "s3":
aws_creds = cast(AwsCredentials, self.fs_client.config.credentials)
+ session_token = (
+ "" if aws_creds.aws_session_token is None else aws_creds.aws_session_token
+ )
endpoint = (
aws_creds.endpoint_url.replace("https://", "")
if aws_creds.endpoint_url
@@ -100,6 +126,7 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non
TYPE S3,
KEY_ID '{aws_creds.aws_access_key_id}',
SECRET '{aws_creds.aws_secret_access_key}',
+ SESSION_TOKEN '{session_token}',
REGION '{aws_creds.region_name}',
ENDPOINT '{endpoint}',
SCOPE '{scope}'
@@ -166,8 +193,6 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:
if not self.has_dataset():
self.create_dataset()
self._conn.sql(f"USE {self.fully_qualified_dataset_name()}")
-
- # create authentication to data provider
self.create_authentication()
return self._conn
@@ -189,8 +214,10 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
if view_name in existing_tables:
continue
+ # NOTE: if this is a staging configuration then `prepare_load_table` will remove some info
+ # from the table schema. If we ever extend this to handle the staging destination, this needs to change
+ schema_table = self.fs_client.prepare_load_table(table_name)
# discover file type
- schema_table = cast(PreparedTableSchema, self.fs_client.schema.tables[table_name])
folder = self.fs_client.get_table_dir(table_name)
files = self.fs_client.list_table_files(table_name)
first_file_type = os.path.splitext(files[0])[1][1:]
@@ -217,12 +244,8 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
)
)
- # discover wether compression is enabled
- compression = (
- ""
- if dlt.config.get("data_writer.disable_compression")
- else ", compression = 'gzip'"
- )
+ # discover whether compression is enabled
+ compression = "" if is_compression_disabled() else ", compression = 'gzip'"
# dlt tables are never compressed for now...
if table_name in self.fs_client.schema.dlt_table_names():
@@ -236,7 +259,7 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
from_statement = f"read_parquet([{resolved_files_string}])"
elif first_file_type == "jsonl":
from_statement = (
- f"read_json([{resolved_files_string}], columns = {{{columns}}}) {compression}"
+ f"read_json([{resolved_files_string}], columns = {{{columns}}}{compression})"
)
else:
raise NotImplementedError(
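
A sketch of the default-secret-directory check introduced above: duckdb's built-in secret_directory ends in ".duckdb/stored_secrets", which is how the warning decides that persistent secrets would land, unencrypted, in the default location.

from pathlib import Path

def is_default_duckdb_secrets_dir(secrets_path: Path) -> bool:
    return (
        len(secrets_path.parts) >= 2
        and secrets_path.parts[-1] == "stored_secrets"
        and secrets_path.parts[-2] == ".duckdb"
    )

assert is_default_duckdb_secrets_dir(Path.home() / ".duckdb" / "stored_secrets")
assert not is_default_duckdb_secrets_dir(Path("/etc/dlt/secrets"))
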
diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py
index 329132f495..8f6a192bb0 100644
--- a/dlt/destinations/impl/lancedb/configuration.py
+++ b/dlt/destinations/impl/lancedb/configuration.py
@@ -59,6 +59,7 @@ class LanceDBClientOptions(BaseConfiguration):
"sentence-transformers",
"huggingface",
"colbert",
+ "ollama",
]
@@ -92,8 +93,6 @@ class LanceDBClientConfiguration(DestinationClientDwhConfiguration):
Make sure it corresponds with the associated embedding model's dimensionality."""
vector_field_name: str = "vector"
"""Name of the special field to store the vector embeddings."""
- id_field_name: str = "id__"
- """Name of the special field to manage deduplication."""
sentinel_table_name: str = "dltSentinelTable"
"""Name of the sentinel table that encapsulates datasets. Since LanceDB has no
concept of schemas, this table serves as a proxy to group related dlt tables together."""
diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py
index 8ce2217007..d0d22ed3fb 100644
--- a/dlt/destinations/impl/lancedb/factory.py
+++ b/dlt/destinations/impl/lancedb/factory.py
@@ -26,8 +26,8 @@ class lancedb(Destination[LanceDBClientConfiguration, "LanceDBClient"]):
def _raw_capabilities(self) -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext()
- caps.preferred_loader_file_format = "jsonl"
- caps.supported_loader_file_formats = ["jsonl"]
+ caps.preferred_loader_file_format = "parquet"
+ caps.supported_loader_file_formats = ["parquet", "reference"]
caps.type_mapper = LanceDBTypeMapper
caps.max_identifier_length = 200
@@ -42,6 +42,10 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext:
caps.timestamp_precision = 6
caps.supported_replace_strategies = ["truncate-and-insert"]
+ caps.recommended_file_size = 128_000_000
+
+ caps.supported_merge_strategies = ["upsert"]
+
return caps
@property
diff --git a/dlt/destinations/impl/lancedb/lancedb_adapter.py b/dlt/destinations/impl/lancedb/lancedb_adapter.py
index 99d5ef43c6..4314dd703f 100644
--- a/dlt/destinations/impl/lancedb/lancedb_adapter.py
+++ b/dlt/destinations/impl/lancedb/lancedb_adapter.py
@@ -1,16 +1,20 @@
-from typing import Any
+from typing import Any, Dict
from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns
from dlt.destinations.utils import get_resource_for_adapter
from dlt.extract import DltResource
+from dlt.extract.items import TTableHintTemplate
VECTORIZE_HINT = "x-lancedb-embed"
+NO_REMOVE_ORPHANS_HINT = "x-lancedb-remove-orphans"
def lancedb_adapter(
data: Any,
embed: TColumnNames = None,
+ merge_key: TColumnNames = None,
+ no_remove_orphans: bool = False,
) -> DltResource:
"""Prepares data for the LanceDB destination by specifying which columns should be embedded.
@@ -20,6 +24,10 @@ def lancedb_adapter(
object.
embed (TColumnNames, optional): Specify columns to generate embeddings for.
It can be a single column name as a string, or a list of column names.
+ merge_key (TColumnNames, optional): Specify columns to merge on.
+ It can be a single column name as a string, or a list of column names.
+ no_remove_orphans (bool): Set to True to skip removing orphaned records in child
+ tables with no parent records after merges; by default they are removed to maintain referential integrity.
Returns:
DltResource: A resource with applied LanceDB-specific hints.
@@ -34,7 +42,8 @@ def lancedb_adapter(
"""
resource = get_resource_for_adapter(data)
- column_hints: TTableSchemaColumns = {}
+ additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {}
+ column_hints: TTableSchemaColumns = None
if embed:
if isinstance(embed, str):
@@ -43,6 +52,7 @@ def lancedb_adapter(
raise ValueError(
"'embed' must be a list of column names or a single column name as a string."
)
+ column_hints = {}
for column_name in embed:
column_hints[column_name] = {
@@ -50,9 +60,16 @@ def lancedb_adapter(
VECTORIZE_HINT: True, # type: ignore[misc]
}
- if not column_hints:
- raise ValueError("A value for 'embed' must be specified.")
+ additional_table_hints[NO_REMOVE_ORPHANS_HINT] = no_remove_orphans
+
+ if column_hints or additional_table_hints or merge_key:
+ resource.apply_hints(
+ merge_key=merge_key, columns=column_hints, additional_table_hints=additional_table_hints
+ )
else:
- resource.apply_hints(columns=column_hints)
+ raise ValueError(
+ "You must must provide at least either the 'embed' or 'merge_key' or 'remove_orphans'"
+ " argument if using the adapter."
+ )
return resource
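
A hedged usage sketch of the extended adapter; the resource, data and column names are made up, and the direct import path is an assumption (the adapter may also be re-exported elsewhere).

import dlt
from dlt.destinations.impl.lancedb.lancedb_adapter import lancedb_adapter

@dlt.resource(primary_key="doc_id", write_disposition="merge")
def documents():
    yield [{"doc_id": 1, "chunk": "hello"}, {"doc_id": 2, "chunk": "world"}]

# embed the "chunk" column and merge on "doc_id"; orphaned nested rows are
# removed after the merge unless no_remove_orphans=True is passed
res = lancedb_adapter(documents(), embed="chunk", merge_key="doc_id")
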
diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py
index 8a347989a0..1a3e1a7d34 100644
--- a/dlt/destinations/impl/lancedb/lancedb_client.py
+++ b/dlt/destinations/impl/lancedb/lancedb_client.py
@@ -1,4 +1,3 @@
-import uuid
from types import TracebackType
from typing import (
List,
@@ -12,15 +11,17 @@
Dict,
Sequence,
TYPE_CHECKING,
+ Set,
)
-from dlt.common.destination.capabilities import DataTypeMapper
import lancedb # type: ignore
+import lancedb.table # type: ignore
import pyarrow as pa
+import pyarrow.parquet as pq
from lancedb import DBConnection
+from lancedb.common import DATA # type: ignore
from lancedb.embeddings import EmbeddingFunctionRegistry, TextEmbeddingFunction # type: ignore
from lancedb.query import LanceQueryBuilder # type: ignore
-from lancedb.table import Table # type: ignore
from numpy import ndarray
from pyarrow import Array, ChunkedArray, ArrowInvalid
@@ -39,53 +40,142 @@
StorageSchemaInfo,
StateInfo,
LoadJob,
+ HasFollowupJobs,
+ FollowupJobRequest,
)
from dlt.common.pendulum import timedelta
from dlt.common.schema import Schema, TSchemaTables
from dlt.common.schema.typing import (
- C_DLT_LOAD_ID,
+ TColumnType,
TTableSchemaColumns,
TWriteDisposition,
+ TColumnSchema,
+ TTableSchema,
)
-from dlt.common.schema.utils import get_columns_names_with_prop
-from dlt.common.storages import FileStorage
-from dlt.common.typing import DictStrAny
+from dlt.common.schema.utils import get_columns_names_with_prop, is_nested_table
+from dlt.common.storages import FileStorage, LoadJobInfo, ParsedLoadJobFileName
from dlt.destinations.impl.lancedb.configuration import (
LanceDBClientConfiguration,
)
from dlt.destinations.impl.lancedb.exceptions import (
lancedb_error,
)
-from dlt.destinations.impl.lancedb.lancedb_adapter import VECTORIZE_HINT
+from dlt.destinations.impl.lancedb.lancedb_adapter import (
+ VECTORIZE_HINT,
+ NO_REMOVE_ORPHANS_HINT,
+)
from dlt.destinations.impl.lancedb.schema import (
make_arrow_field_schema,
make_arrow_table_schema,
TArrowSchema,
NULL_SCHEMA,
TArrowField,
+ arrow_datatype_to_fusion_datatype,
+ TTableLineage,
+ TableJob,
)
from dlt.destinations.impl.lancedb.utils import (
- list_merge_identifiers,
- generate_uuid,
set_non_standard_providers_environment_variables,
+ EMPTY_STRING_PLACEHOLDER,
+ fill_empty_source_column_values_with_placeholder,
+ get_canonical_vector_database_doc_id_merge_key,
+ create_filter_condition,
)
+from dlt.destinations.job_impl import ReferenceFollowupJobRequest
+from dlt.destinations.type_mapping import TypeMapperImpl
if TYPE_CHECKING:
NDArray = ndarray[Any, Any]
else:
NDArray = ndarray
-EMPTY_STRING_PLACEHOLDER = "0uEoDNBpQUBwsxKbmxxB"
+TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"}
+UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()}
+BATCH_PROCESS_CHUNK_SIZE = 10_000
+
+
+class LanceDBTypeMapper(TypeMapperImpl):
+ sct_to_unbound_dbt = {
+ "text": pa.string(),
+ "double": pa.float64(),
+ "bool": pa.bool_(),
+ "bigint": pa.int64(),
+ "binary": pa.binary(),
+ "date": pa.date32(),
+ "json": pa.string(),
+ }
+
+ sct_to_dbt = {}
+
+ dbt_to_sct = {
+ pa.string(): "text",
+ pa.float64(): "double",
+ pa.bool_(): "bool",
+ pa.int64(): "bigint",
+ pa.binary(): "binary",
+ pa.date32(): "date",
+ }
+
+ def to_db_decimal_type(self, column: TColumnSchema) -> pa.Decimal128Type:
+ precision, scale = self.decimal_precision(column.get("precision"), column.get("scale"))
+ return pa.decimal128(precision, scale)
+
+ def to_db_datetime_type(
+ self,
+ column: TColumnSchema,
+ table: TTableSchema = None,
+ ) -> pa.TimestampType:
+ column_name = column.get("name")
+ timezone = column.get("timezone")
+ precision = column.get("precision")
+ if timezone is not None or precision is not None:
+ logger.warning(
+ "LanceDB does not currently support column flags for timezone or precision."
+ f" These flags were used in column '{column_name}'."
+ )
+ unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision]
+ return pa.timestamp(unit, "UTC")
+
+ def to_db_time_type(self, column: TColumnSchema, table: TTableSchema = None) -> pa.Time64Type:
+ unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision]
+ return pa.time64(unit)
+
+ def from_db_type(
+ self,
+ db_type: pa.DataType,
+ precision: Optional[int] = None,
+ scale: Optional[int] = None,
+ ) -> TColumnType:
+ if isinstance(db_type, pa.TimestampType):
+ return dict(
+ data_type="timestamp",
+ precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit],
+ scale=scale,
+ )
+ if isinstance(db_type, pa.Time64Type):
+ return dict(
+ data_type="time",
+ precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit],
+ scale=scale,
+ )
+ if isinstance(db_type, pa.Decimal128Type):
+ precision, scale = db_type.precision, db_type.scale
+ if (precision, scale) == self.capabilities.wei_precision:
+ return cast(TColumnType, dict(data_type="wei"))
+ return dict(data_type="decimal", precision=precision, scale=scale)
+ return super().from_db_type(cast(str, db_type), precision, scale) # type: ignore
-def upload_batch(
- records: List[DictStrAny],
+def write_records(
+ records: DATA,
/,
*,
db_client: DBConnection,
table_name: str,
- write_disposition: TWriteDisposition,
- id_field_name: Optional[str] = None,
+ write_disposition: Optional[TWriteDisposition] = "append",
+ merge_key: Optional[str] = None,
+ remove_orphans: Optional[bool] = False,
+ filter_condition: Optional[str] = None,
) -> None:
"""Inserts records into a LanceDB table with automatic embedding computation.
@@ -93,8 +183,11 @@ def upload_batch(
records: The data to be inserted as payload.
db_client: The LanceDB client connection.
table_name: The name of the table to insert into.
- id_field_name: The name of the ID field for update/merge operations.
+ merge_key: Keys for update/merge operations.
write_disposition: The write disposition - one of 'skip', 'append', 'replace', 'merge'.
+ remove_orphans (bool): Whether to remove orphans after insertion (merge disposition only).
+ filter_condition (str): An SQL filter restricting which unmatched target rows are deleted during orphan removal.
+ If None, all unmatched rows are deleted.
Raises:
ValueError: If the write disposition is unsupported, or `id_field_name` is not
@@ -110,16 +203,17 @@ def upload_batch(
) from e
try:
- if write_disposition in ("append", "skip"):
+ if write_disposition in ("append", "skip", "replace"):
tbl.add(records)
- elif write_disposition == "replace":
- tbl.add(records, mode="overwrite")
elif write_disposition == "merge":
- if not id_field_name:
- raise ValueError("To perform a merge update, 'id_field_name' must be specified.")
- tbl.merge_insert(
- id_field_name
- ).when_matched_update_all().when_not_matched_insert_all().execute(records)
+ if remove_orphans:
+ tbl.merge_insert(merge_key).when_not_matched_by_source_delete(
+ filter_condition
+ ).execute(records)
+ else:
+ tbl.merge_insert(
+ merge_key
+ ).when_matched_update_all().when_not_matched_insert_all().execute(records)
else:
raise DestinationTerminalException(
f"Unsupported write disposition {write_disposition} for LanceDB Destination - batch"
@@ -135,6 +229,8 @@ class LanceDBClient(JobClientBase, WithStateSync):
"""LanceDB destination handler."""
model_func: TextEmbeddingFunction
+ """The embedder callback used for each chunk."""
+ dataset_name: str
def __init__(
self,
@@ -152,6 +248,7 @@ def __init__(
self.registry = EmbeddingFunctionRegistry.get_instance()
self.type_mapper = self.capabilities.get_type_mapper()
self.sentinel_table_name = config.sentinel_table_name
+ self.dataset_name = self.config.normalize_dataset_name(self.schema)
embedding_model_provider = self.config.embedding_model_provider
@@ -169,11 +266,6 @@ def __init__(
)
self.vector_field_name = self.config.vector_field_name
- self.id_field_name = self.config.id_field_name
-
- @property
- def dataset_name(self) -> str:
- return self.config.normalize_dataset_name(self.schema)
@property
def sentinel_table(self) -> str:
@@ -187,7 +279,7 @@ def make_qualified_table_name(self, table_name: str) -> str:
)
def get_table_schema(self, table_name: str) -> TArrowSchema:
- schema_table: Table = self.db_client.open_table(table_name)
+ schema_table: "lancedb.table.Table" = self.db_client.open_table(table_name)
schema_table.checkout_latest()
schema = schema_table.schema
return cast(
@@ -196,13 +288,15 @@ def get_table_schema(self, table_name: str) -> TArrowSchema:
)
@lancedb_error
- def create_table(self, table_name: str, schema: TArrowSchema, mode: str = "create") -> Table:
+ def create_table(
+ self, table_name: str, schema: TArrowSchema, mode: str = "create"
+ ) -> "lancedb.table.Table":
"""Create a LanceDB Table from the provided LanceModel or PyArrow schema.
Args:
schema: The table schema to create.
table_name: The name of the table to create.
- mode (): The mode to use when creating the table. Can be either "create" or "overwrite".
+ mode (str): The mode to use when creating the table. Can be either "create" or "overwrite".
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
"""
@@ -230,7 +324,7 @@ def query_table(
Returns:
A LanceDB query builder.
"""
- query_table: Table = self.db_client.open_table(table_name)
+ query_table: "lancedb.table.Table" = self.db_client.open_table(table_name)
query_table.checkout_latest()
return query_table.search(query=query)
@@ -255,7 +349,7 @@ def drop_storage(self) -> None:
Deletes all tables in the dataset and all data, as well as sentinel table associated with them.
- If the dataset name was not provided, it deletes all the tables in the current schema.
+ If the dataset name wasn't provided, it deletes all the tables in the current schema.
"""
for table_name in self._get_table_names():
self.db_client.drop_table(table_name)
@@ -282,7 +376,22 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
def is_storage_initialized(self) -> bool:
return self.table_exists(self.sentinel_table)
- def _create_sentinel_table(self) -> Table:
+ def verify_schema(
+ self, only_tables: Iterable[str] = None, new_jobs: Iterable[ParsedLoadJobFileName] = None
+ ) -> List[PreparedTableSchema]:
+ loaded_tables = super().verify_schema(only_tables, new_jobs)
+ # verify merge keys early
+ for load_table in loaded_tables:
+ if not is_nested_table(load_table) and not load_table.get(NO_REMOVE_ORPHANS_HINT):
+ if merge_key := get_columns_names_with_prop(load_table, "merge_key"):
+ if len(merge_key) > 1:
+ raise DestinationTerminalException(
+ "You cannot specify multiple merge keys with LanceDB orphan remove"
+ f" enabled: {merge_key}"
+ )
+ return loaded_tables
+
+ def _create_sentinel_table(self) -> "lancedb.table.Table":
"""Create an empty table to indicate that the storage is initialized."""
return self.create_table(schema=NULL_SCHEMA, table_name=self.sentinel_table)
@@ -310,7 +419,7 @@ def update_stored_schema(
# TODO: return a real updated table schema (like in SQL job client)
self._execute_schema_update(only_tables)
else:
- logger.info(
+ logger.debug(
f"Schema with hash {self.schema.stored_version_hash} "
f"inserted at {schema_info.inserted_at} found "
"in storage, no upgrade required"
@@ -325,7 +434,7 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]
try:
fq_table_name = self.make_qualified_table_name(table_name)
- table: Table = self.db_client.open_table(fq_table_name)
+ table: "lancedb.table.Table" = self.db_client.open_table(fq_table_name)
table.checkout_latest()
arrow_schema: TArrowSchema = table.schema
except FileNotFoundError:
@@ -341,34 +450,33 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]
return True, table_schema
@lancedb_error
- def add_table_fields(
- self, table_name: str, field_schemas: List[TArrowField]
- ) -> Optional[Table]:
- """Add multiple fields to the LanceDB table at once.
+ def extend_lancedb_table_schema(self, table_name: str, field_schemas: List[pa.Field]) -> None:
+ """Extend LanceDB table schema with empty columns.
Args:
- table_name: The name of the table to create the fields on.
- field_schemas: The list of fields to create.
+ table_name: The name of the table to create the fields on.
+ field_schemas: The list of PyArrow Fields to create in the target LanceDB table.
"""
- table: Table = self.db_client.open_table(table_name)
+ table: "lancedb.table.Table" = self.db_client.open_table(table_name)
table.checkout_latest()
- arrow_table = table.to_arrow()
-
- # Check if any of the new fields already exist in the table.
- existing_fields = set(arrow_table.schema.names)
- new_fields = [field for field in field_schemas if field.name not in existing_fields]
- if not new_fields:
- # All fields already present, skip.
- return None
+ try:
+ # Use DataFusion SQL syntax to alter fields without loading data into client memory.
+ # This is currently the most efficient way to modify column values in LanceDB.
+ new_fields = {
+ field.name: f"CAST(NULL AS {arrow_datatype_to_fusion_datatype(field.type)})"
+ for field in field_schemas
+ }
+ table.add_columns(new_fields)
- null_arrays = [pa.nulls(len(arrow_table), type=field.type) for field in new_fields]
+ # Make new columns nullable in the Arrow schema.
+ # Necessary because the Datafusion SQL API doesn't set new columns as nullable by default.
+ for field in field_schemas:
+ table.alter_columns({"path": field.name, "nullable": field.nullable})
- for field, null_array in zip(new_fields, null_arrays):
- arrow_table = arrow_table.append_column(field, null_array)
+            # TODO: the update() call below doesn't work for bulk NULL assignments; raise with LanceDB developers.
+ # table.update(values={field.name: None})
- try:
- return self.db_client.create_table(table_name, arrow_table, mode="overwrite")
except OSError:
             # Error occurred while altering the table, skip.
return None
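Stand-alone, the add-then-alter pattern used above looks like this; a minimal sketch assuming a recent lancedb client where add_columns/alter_columns accept the arguments shown (paths and names are illustrative):

    import lancedb
    import pyarrow as pa

    db = lancedb.connect("/tmp/lancedb_demo")
    tbl = db.create_table("docs", pa.table({"id": pa.array([1, 2], pa.int64())}))

    # add an empty column via a DataFusion SQL expression, no data is pulled client-side
    tbl.add_columns({"score": "CAST(NULL AS DOUBLE)"})
    # the SQL path does not mark the new column nullable, so flip the flag afterwards
    tbl.alter_columns({"path": "score", "nullable": True})
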
@@ -376,36 +484,31 @@ def add_table_fields(
def _execute_schema_update(self, only_tables: Iterable[str]) -> None:
for table_name in only_tables or self.schema.tables:
exists, existing_columns = self.get_storage_table(table_name)
- new_columns = self.schema.get_new_table_columns(
+ new_columns: List[TColumnSchema] = self.schema.get_new_table_columns(
table_name,
existing_columns,
self.capabilities.generates_case_sensitive_identifiers(),
)
- embedding_fields: List[str] = get_columns_names_with_prop(
- self.schema.get_table(table_name), VECTORIZE_HINT
- )
logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}")
- if len(new_columns) > 0:
+ if new_columns:
if exists:
field_schemas: List[TArrowField] = [
make_arrow_field_schema(column["name"], column, self.type_mapper)
for column in new_columns
]
fq_table_name = self.make_qualified_table_name(table_name)
- self.add_table_fields(fq_table_name, field_schemas)
+ self.extend_lancedb_table_schema(fq_table_name, field_schemas)
else:
if table_name not in self.schema.dlt_table_names():
embedding_fields = get_columns_names_with_prop(
self.schema.get_table(table_name=table_name), VECTORIZE_HINT
)
vector_field_name = self.vector_field_name
- id_field_name = self.id_field_name
embedding_model_func = self.model_func
embedding_model_dimensions = self.config.embedding_model_dimensions
else:
embedding_fields = None
vector_field_name = None
- id_field_name = None
embedding_model_func = None
embedding_model_dimensions = None
@@ -417,7 +520,6 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None:
embedding_model_func=embedding_model_func,
embedding_model_dimensions=embedding_model_dimensions,
vector_field_name=vector_field_name,
- id_field_name=id_field_name,
)
fq_table_name = self.make_qualified_table_name(table_name)
self.create_table(fq_table_name, table_schema)
@@ -446,7 +548,8 @@ def update_schema_in_storage(self) -> None:
write_disposition = self.schema.get_table(self.schema.version_table_name).get(
"write_disposition"
)
- upload_batch(
+
+ write_records(
records,
db_client=self.db_client,
table_name=fq_version_table_name,
@@ -459,15 +562,17 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
fq_state_table_name = self.make_qualified_table_name(self.schema.state_table_name)
fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name)
- state_table_: Table = self.db_client.open_table(fq_state_table_name)
+ state_table_: "lancedb.table.Table" = self.db_client.open_table(fq_state_table_name)
state_table_.checkout_latest()
- loads_table_: Table = self.db_client.open_table(fq_loads_table_name)
+ loads_table_: "lancedb.table.Table" = self.db_client.open_table(fq_loads_table_name)
loads_table_.checkout_latest()
# normalize property names
p_load_id = self.schema.naming.normalize_identifier("load_id")
- p_dlt_load_id = self.schema.naming.normalize_identifier(C_DLT_LOAD_ID)
+ p_dlt_load_id = self.schema.naming.normalize_identifier(
+ self.schema.data_item_normalizer.c_dlt_load_id # type: ignore[attr-defined]
+ )
p_pipeline_name = self.schema.naming.normalize_identifier("pipeline_name")
p_status = self.schema.naming.normalize_identifier("status")
p_version = self.schema.naming.normalize_identifier("version")
@@ -476,7 +581,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
p_created_at = self.schema.naming.normalize_identifier("created_at")
p_version_hash = self.schema.naming.normalize_identifier("version_hash")
- # Read the tables into memory as Arrow tables, with pushdown predicates, so we pull as less
+ # Read the tables into memory as Arrow tables, with pushdown predicates, so we pull as little
# data into memory as possible.
state_table = (
state_table_.search()
@@ -508,7 +613,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]:
fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name)
- version_table: Table = self.db_client.open_table(fq_version_table_name)
+ version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name)
version_table.checkout_latest()
p_version_hash = self.schema.naming.normalize_identifier("version_hash")
p_inserted_at = self.schema.naming.normalize_identifier("inserted_at")
@@ -524,8 +629,6 @@ def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaI
)
).to_list()
- # LanceDB's ORDER BY clause doesn't seem to work.
- # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341
most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0]
return StorageSchemaInfo(
version_hash=most_recent_schema[p_version_hash],
@@ -543,7 +646,7 @@ def get_stored_schema(self, schema_name: str = None) -> Optional[StorageSchemaIn
"""Retrieves newest schema from destination storage."""
fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name)
- version_table: Table = self.db_client.open_table(fq_version_table_name)
+ version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name)
version_table.checkout_latest()
p_version_hash = self.schema.naming.normalize_identifier("version_hash")
p_inserted_at = self.schema.naming.normalize_identifier("inserted_at")
@@ -558,8 +661,6 @@ def get_stored_schema(self, schema_name: str = None) -> Optional[StorageSchemaIn
query = query.where(f'`{p_schema_name}` = "{schema_name}"', prefilter=True)
schemas = query.to_list()
- # LanceDB's ORDER BY clause doesn't seem to work.
- # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341
most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0]
return StorageSchemaInfo(
version_hash=most_recent_schema[p_version_hash],
@@ -591,16 +692,14 @@ def complete_load(self, load_id: str) -> None:
self.schema.naming.normalize_identifier("schema_name"): self.schema.name,
self.schema.naming.normalize_identifier("status"): 0,
self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()),
- self.schema.naming.normalize_identifier(
- "schema_version_hash"
- ): None, # Payload schema must match the target schema.
+ self.schema.naming.normalize_identifier("schema_version_hash"): None,
}
]
fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name)
write_disposition = self.schema.get_table(self.schema.loads_table_name).get(
"write_disposition"
)
- upload_batch(
+ write_records(
records,
db_client=self.db_client,
table_name=fq_loads_table_name,
@@ -610,80 +709,152 @@ def complete_load(self, load_id: str) -> None:
def create_load_job(
self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False
) -> LoadJob:
- return LanceDBLoadJob(
- file_path=file_path,
- type_mapper=self.type_mapper,
- model_func=self.model_func,
- fq_table_name=self.make_qualified_table_name(table["name"]),
+ if ReferenceFollowupJobRequest.is_reference_job(file_path):
+ return LanceDBRemoveOrphansJob(file_path)
+ else:
+ return LanceDBLoadJob(file_path, table)
+
+ def create_table_chain_completed_followup_jobs(
+ self,
+ table_chain: Sequence[TTableSchema],
+ completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None,
+ ) -> List[FollowupJobRequest]:
+ jobs = super().create_table_chain_completed_followup_jobs(
+ table_chain, completed_table_chain_jobs # type: ignore[arg-type]
)
+ # Orphan removal is only supported for upsert strategy because we need a deterministic key hash.
+ first_table_in_chain = table_chain[0]
+ if first_table_in_chain.get(
+ "write_disposition"
+ ) == "merge" and not first_table_in_chain.get(NO_REMOVE_ORPHANS_HINT):
+ all_job_paths_ordered = [
+ job.file_path
+ for table in table_chain
+ for job in completed_table_chain_jobs
+ if job.job_file_info.table_name == table.get("name")
+ ]
+ root_table_file_name = FileStorage.get_file_name_from_file_path(
+ all_job_paths_ordered[0]
+ )
+ jobs.append(ReferenceFollowupJobRequest(root_table_file_name, all_job_paths_ordered))
+ return jobs
def table_exists(self, table_name: str) -> bool:
return table_name in self.db_client.table_names()
-class LanceDBLoadJob(RunnableLoadJob):
+class LanceDBLoadJob(RunnableLoadJob, HasFollowupJobs):
arrow_schema: TArrowSchema
def __init__(
self,
file_path: str,
- type_mapper: DataTypeMapper,
- model_func: TextEmbeddingFunction,
- fq_table_name: str,
+ table_schema: TTableSchema,
) -> None:
super().__init__(file_path)
- self._type_mapper = type_mapper
- self._fq_table_name: str = fq_table_name
- self._model_func = model_func
self._job_client: "LanceDBClient" = None
+ self._table_schema: TTableSchema = table_schema
def run(self) -> None:
- self._db_client: DBConnection = self._job_client.db_client
- self._embedding_model_func: TextEmbeddingFunction = self._model_func
- self._embedding_model_dimensions: int = self._job_client.config.embedding_model_dimensions
- self._id_field_name: str = self._job_client.config.id_field_name
-
- unique_identifiers: Sequence[str] = list_merge_identifiers(self._load_table)
+ db_client: DBConnection = self._job_client.db_client
+ fq_table_name: str = self._job_client.make_qualified_table_name(self._table_schema["name"])
write_disposition: TWriteDisposition = cast(
TWriteDisposition, self._load_table.get("write_disposition", "append")
)
- with FileStorage.open_zipsafe_ro(self._file_path) as f:
- records: List[DictStrAny] = [json.loads(line) for line in f]
+ with FileStorage.open_zipsafe_ro(self._file_path, mode="rb") as f:
+ arrow_table: pa.Table = pq.read_table(f)
# Replace empty strings with placeholder string if OpenAI is used.
# https://github.com/lancedb/lancedb/issues/1577#issuecomment-2318104218.
if (self._job_client.config.embedding_model_provider == "openai") and (
source_columns := get_columns_names_with_prop(self._load_table, VECTORIZE_HINT)
):
- records = [
- {
- k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v
- for k, v in record.items()
- }
- for record in records
- ]
-
- if self._load_table not in self._schema.dlt_tables():
- for record in records:
- # Add reserved ID fields.
- uuid_id = (
- generate_uuid(record, unique_identifiers, self._fq_table_name)
- if unique_identifiers
- else str(uuid.uuid4())
- )
- record.update({self._id_field_name: uuid_id})
+ arrow_table = fill_empty_source_column_values_with_placeholder(
+ arrow_table, source_columns, EMPTY_STRING_PLACEHOLDER
+ )
- # LanceDB expects all fields in the target arrow table to be present in the data payload.
- # We add and set these missing fields, that are fields not present in the target schema, to NULL.
- missing_fields = set(self._load_table["columns"]) - set(record)
- for field in missing_fields:
- record[field] = None
+ # We need upsert merge's deterministic _dlt_id to perform orphan removal.
+ # Hence, we require at least a primary key on the root table if the merge disposition is chosen.
+ if (
+ (self._load_table not in self._schema.dlt_table_names())
+ and not is_nested_table(self._load_table) # Is root table.
+ and (write_disposition == "merge")
+ and (not get_columns_names_with_prop(self._load_table, "primary_key"))
+ ):
+ raise DestinationTerminalException(
+                "LanceDB's merge write disposition requires at least one explicit"
+                " primary key on the root table."
+ )
- upload_batch(
- records,
- db_client=self._db_client,
- table_name=self._fq_table_name,
+ dlt_id = self._schema.naming.normalize_identifier(
+ self._schema.data_item_normalizer.c_dlt_id # type: ignore[attr-defined]
+ )
+ write_records(
+ arrow_table,
+ db_client=db_client,
+ table_name=fq_table_name,
write_disposition=write_disposition,
- id_field_name=self._id_field_name,
+ merge_key=dlt_id,
)
+
+
+class LanceDBRemoveOrphansJob(RunnableLoadJob):
+ orphaned_ids: Set[str]
+
+ def __init__(
+ self,
+ file_path: str,
+ ) -> None:
+ super().__init__(file_path)
+ self._job_client: "LanceDBClient" = None
+ self.references = ReferenceFollowupJobRequest.resolve_references(file_path)
+
+ def run(self) -> None:
+ dlt_load_id = self._schema.data_item_normalizer.c_dlt_load_id # type: ignore[attr-defined]
+ dlt_id = self._schema.data_item_normalizer.c_dlt_id # type: ignore[attr-defined]
+ dlt_root_id = self._schema.data_item_normalizer.c_dlt_root_id # type: ignore[attr-defined]
+
+ db_client: DBConnection = self._job_client.db_client
+ table_lineage: TTableLineage = [
+ TableJob(
+ table_schema=self._schema.get_table(
+ ParsedLoadJobFileName.parse(file_path_).table_name
+ ),
+ table_name=ParsedLoadJobFileName.parse(file_path_).table_name,
+ file_path=file_path_,
+ )
+ for file_path_ in self.references
+ ]
+
+ for job in table_lineage:
+ target_is_root_table = not is_nested_table(job.table_schema)
+ fq_table_name = self._job_client.make_qualified_table_name(job.table_name)
+ file_path = job.file_path
+ with FileStorage.open_zipsafe_ro(file_path, mode="rb") as f:
+ payload_arrow_table: pa.Table = pq.read_table(f)
+
+ if target_is_root_table:
+ canonical_doc_id_field = get_canonical_vector_database_doc_id_merge_key(
+ job.table_schema
+ )
+ filter_condition = create_filter_condition(
+ canonical_doc_id_field, payload_arrow_table[canonical_doc_id_field]
+ )
+ merge_key = dlt_load_id
+
+ else:
+ filter_condition = create_filter_condition(
+ dlt_root_id,
+ payload_arrow_table[dlt_root_id],
+ )
+ merge_key = dlt_id
+
+ write_records(
+ payload_arrow_table,
+ db_client=db_client,
+ table_name=fq_table_name,
+ write_disposition="merge",
+ merge_key=merge_key,
+ remove_orphans=True,
+ filter_condition=filter_condition,
+ )
diff --git a/dlt/destinations/impl/lancedb/schema.py b/dlt/destinations/impl/lancedb/schema.py
index 27c6fb33a1..25dfbc840a 100644
--- a/dlt/destinations/impl/lancedb/schema.py
+++ b/dlt/destinations/impl/lancedb/schema.py
@@ -1,6 +1,5 @@
"""Utilities for creating arrow schemas from table schemas."""
-
-from dlt.common.json import json
+from collections import namedtuple
from typing import (
List,
cast,
@@ -11,17 +10,19 @@
from lancedb.embeddings import TextEmbeddingFunction # type: ignore
from typing_extensions import TypeAlias
+from dlt.common.destination.capabilities import DataTypeMapper
+from dlt.common.json import json
from dlt.common.schema import Schema, TColumnSchema
from dlt.common.typing import DictStrAny
-from dlt.common.destination.capabilities import DataTypeMapper
-
TArrowSchema: TypeAlias = pa.Schema
TArrowDataType: TypeAlias = pa.DataType
TArrowField: TypeAlias = pa.Field
NULL_SCHEMA: TArrowSchema = pa.schema([])
"""Empty pyarrow Schema with no fields."""
+TableJob = namedtuple("TableJob", ["table_schema", "table_name", "file_path"])
+TTableLineage: TypeAlias = List[TableJob]
def arrow_schema_to_dict(schema: TArrowSchema) -> DictStrAny:
@@ -42,7 +43,6 @@ def make_arrow_table_schema(
table_name: str,
schema: Schema,
type_mapper: DataTypeMapper,
- id_field_name: Optional[str] = None,
vector_field_name: Optional[str] = None,
embedding_fields: Optional[List[str]] = None,
embedding_model_func: Optional[TextEmbeddingFunction] = None,
@@ -51,9 +51,6 @@ def make_arrow_table_schema(
"""Creates a PyArrow schema from a dlt schema."""
arrow_schema: List[TArrowField] = []
- if id_field_name:
- arrow_schema.append(pa.field(id_field_name, pa.string()))
-
if embedding_fields:
# User's provided dimension config, if provided, takes precedence.
vec_size = embedding_model_dimensions or embedding_model_func.ndims()
@@ -83,3 +80,22 @@ def make_arrow_table_schema(
metadata["embedding_functions"] = json.dumps(embedding_functions).encode("utf-8")
return pa.schema(arrow_schema, metadata=metadata)
+
+
+def arrow_datatype_to_fusion_datatype(arrow_type: TArrowDataType) -> str:
+ type_map = {
+ pa.bool_(): "BOOLEAN",
+ pa.int64(): "BIGINT",
+ pa.float64(): "DOUBLE",
+ pa.utf8(): "STRING",
+ pa.binary(): "BYTEA",
+ pa.date32(): "DATE",
+ }
+
+ if isinstance(arrow_type, pa.Decimal128Type):
+ return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})"
+
+ if isinstance(arrow_type, pa.TimestampType):
+ return "TIMESTAMP"
+
+ return type_map.get(arrow_type, "UNKNOWN")
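A few illustrative calls showing the mapping this helper produces (assuming it is imported from the schema module above):

    import pyarrow as pa

    arrow_datatype_to_fusion_datatype(pa.int64())            # "BIGINT"
    arrow_datatype_to_fusion_datatype(pa.decimal128(38, 9))  # "DECIMAL(38, 9)"
    arrow_datatype_to_fusion_datatype(pa.timestamp("us"))    # "TIMESTAMP"
    arrow_datatype_to_fusion_datatype(pa.list_(pa.int64()))  # "UNKNOWN" (no mapping)
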
diff --git a/dlt/destinations/impl/lancedb/utils.py b/dlt/destinations/impl/lancedb/utils.py
index aeacd4d34b..56991b090f 100644
--- a/dlt/destinations/impl/lancedb/utils.py
+++ b/dlt/destinations/impl/lancedb/utils.py
@@ -1,13 +1,16 @@
import os
-import uuid
-from typing import Sequence, Union, Dict
+from typing import Union, Dict, List
+import pyarrow as pa
+
+from dlt.common import logger
+from dlt.common.data_writers.escape import escape_lancedb_literal
+from dlt.common.destination.exceptions import DestinationTerminalException
from dlt.common.schema import TTableSchema
-from dlt.common.schema.utils import get_columns_names_with_prop
-from dlt.common.typing import DictStrAny
+from dlt.common.schema.utils import get_columns_names_with_prop, get_first_column_name_with_prop
from dlt.destinations.impl.lancedb.configuration import TEmbeddingProvider
-
+EMPTY_STRING_PLACEHOLDER = "0uEoDNBpQUBwsxKbmxxB"
PROVIDER_ENVIRONMENT_VARIABLES_MAP: Dict[TEmbeddingProvider, str] = {
"cohere": "COHERE_API_KEY",
"gemini-text": "GOOGLE_API_KEY",
@@ -16,40 +19,55 @@
}
-def generate_uuid(data: DictStrAny, unique_identifiers: Sequence[str], table_name: str) -> str:
- """Generates deterministic UUID - used for deduplication.
+def set_non_standard_providers_environment_variables(
+ embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None]
+) -> None:
+ if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP:
+ os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or ""
- Args:
- data (Dict[str, Any]): Arbitrary data to generate UUID for.
- unique_identifiers (Sequence[str]): A list of unique identifiers.
- table_name (str): LanceDB table name.
- Returns:
- str: A string representation of the generated UUID.
- """
- data_id = "_".join(str(data[key]) for key in unique_identifiers)
- return str(uuid.uuid5(uuid.NAMESPACE_DNS, table_name + data_id))
+def get_canonical_vector_database_doc_id_merge_key(
+ load_table: TTableSchema,
+) -> str:
+ if merge_key := get_first_column_name_with_prop(load_table, "merge_key"):
+ return merge_key
+ elif primary_key := get_columns_names_with_prop(load_table, "primary_key"):
+ # No merge key defined, warn and assume the first element of the primary key is `doc_id`.
+ logger.warning(
+ "Merge strategy selected without defined merge key - using the first element of the"
+ f" primary key ({primary_key}) as merge key."
+ )
+ return primary_key[0]
+ else:
+ raise DestinationTerminalException(
+ "You must specify at least a primary key in order to perform orphan removal."
+ )
-def list_merge_identifiers(table_schema: TTableSchema) -> Sequence[str]:
- """Returns a list of merge keys for a table used for either merging or deduplication.
+def fill_empty_source_column_values_with_placeholder(
+ table: pa.Table, source_columns: List[str], placeholder: str
+) -> pa.Table:
+ """
+ Replaces empty strings and null values in the specified source columns of an Arrow table with a placeholder string.
Args:
- table_schema (TTableSchema): a dlt table schema.
+ table (pa.Table): The input Arrow table.
+ source_columns (List[str]): A list of column names to replace empty strings and null values in.
+ placeholder (str): The placeholder string to use for replacement.
Returns:
- Sequence[str]: A list of unique column identifiers.
+ pa.Table: The modified Arrow table with empty strings and null values replaced in the specified columns.
"""
- if table_schema.get("write_disposition") == "merge":
- primary_keys = get_columns_names_with_prop(table_schema, "primary_key")
- merge_keys = get_columns_names_with_prop(table_schema, "merge_key")
- if join_keys := list(set(primary_keys + merge_keys)):
- return join_keys
- return get_columns_names_with_prop(table_schema, "unique")
+ for col_name in source_columns:
+ column = table[col_name]
+ filled_column = pa.compute.fill_null(column, fill_value=placeholder)
+ new_column = pa.compute.replace_substring_regex(
+ filled_column, pattern=r"^$", replacement=placeholder
+ )
+ table = table.set_column(table.column_names.index(col_name), col_name, new_column)
+ return table
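A quick check of the helper above, using plain pyarrow (the placeholder value is illustrative):

    import pyarrow as pa

    tbl = pa.table({"text": ["", None, "ok"], "other": [1, 2, 3]})
    fixed = fill_empty_source_column_values_with_placeholder(tbl, ["text"], "__EMPTY__")
    assert fixed["text"].to_pylist() == ["__EMPTY__", "__EMPTY__", "ok"]
    assert fixed["other"].to_pylist() == [1, 2, 3]
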
-def set_non_standard_providers_environment_variables(
- embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None]
-) -> None:
- if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP:
- os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or ""
+def create_filter_condition(field_name: str, array: pa.Array) -> str:
+ array_py = array.to_pylist()
+ return f"{field_name} IN ({', '.join(map(escape_lancedb_literal, array_py))})"
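The resulting predicate is a plain SQL IN list; roughly (exact quoting depends on escape_lancedb_literal):

    import pyarrow as pa

    create_filter_condition("doc_id", pa.array(["a", "b"]))
    # -> "doc_id IN ('a', 'b')"  (assumed output; quoting is done by the escape helper)
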
diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py
index 6ec2beb95e..9f05b88bb5 100644
--- a/dlt/destinations/impl/mssql/sql_client.py
+++ b/dlt/destinations/impl/mssql/sql_client.py
@@ -6,7 +6,7 @@
import pyodbc
from contextlib import contextmanager
-from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence
+from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, Tuple
from dlt.destinations.exceptions import (
DatabaseTerminalException,
@@ -183,3 +183,6 @@ def _make_database_exception(cls, ex: Exception) -> Exception:
@staticmethod
def is_dbapi_exception(ex: Exception) -> bool:
return isinstance(ex, pyodbc.Error)
+
+ def _limit_clause_sql(self, limit: int) -> Tuple[str, str]:
+ return f"TOP ({limit})", ""
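The two fragments returned by _limit_clause_sql are meant to be spliced in right after SELECT and at the end of the statement respectively; a sketch of how a caller could assemble them (illustrative only, not the actual dataset code):

    def build_select(columns: str, table: str, pre: str, post: str) -> str:
        # pre lands right after SELECT (MSSQL TOP), post at the end (ANSI LIMIT)
        return " ".join(p for p in ("SELECT", pre, columns, "FROM", table, post) if p)

    build_select("*", "items", "", "LIMIT 10")   # base class: SELECT * FROM items LIMIT 10
    build_select("*", "items", "TOP (10)", "")   # mssql:      SELECT TOP (10) * FROM items
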
diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py
index abe301fff0..bbc2d719a8 100644
--- a/dlt/destinations/impl/qdrant/qdrant_adapter.py
+++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py
@@ -34,7 +34,7 @@ def qdrant_adapter(
"""
resource = get_resource_for_adapter(data)
- column_hints: TTableSchemaColumns = {}
+ column_hints: TTableSchemaColumns = None
if embed:
if isinstance(embed, str):
@@ -44,6 +44,7 @@ def qdrant_adapter(
"embed must be a list of column names or a single column name as a string"
)
+ column_hints = {}
for column_name in embed:
column_hints[column_name] = {
"name": column_name,
diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py
index 6581889296..2335166761 100644
--- a/dlt/destinations/impl/redshift/redshift.py
+++ b/dlt/destinations/impl/redshift/redshift.py
@@ -1,6 +1,8 @@
import platform
import os
+from dlt.destinations.utils import is_compression_disabled
+
if platform.python_implementation() == "PyPy":
import psycopg2cffi as psycopg2
@@ -10,8 +12,7 @@
# from psycopg2.sql import SQL, Composed
-from typing import Dict, List, Optional, Sequence, Any, Tuple
-
+from typing import Dict, List, Optional, Sequence
from dlt.common.destination.reference import (
FollowupJobRequest,
@@ -93,7 +94,7 @@ def run(self) -> None:
if ext == "jsonl":
file_type = "FORMAT AS JSON 'auto'"
dateformat = "dateformat 'auto' timeformat 'auto'"
- compression = "GZIP"
+ compression = "" if is_compression_disabled() else "GZIP"
elif ext == "parquet":
file_type = "PARQUET"
# if table contains json types then SUPER field will be used.
diff --git a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py
index a407e53d70..6f3ff065bf 100644
--- a/dlt/destinations/impl/sqlalchemy/db_api_client.py
+++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py
@@ -80,7 +80,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None:
self.fetchone = curr.fetchone # type: ignore[assignment]
self.fetchmany = curr.fetchmany # type: ignore[assignment]
- self.set_default_schema_columns()
+ self._set_default_schema_columns()
def _get_columns(self) -> List[str]:
try:
diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py
index 9bd0b41783..0ca9047528 100644
--- a/dlt/destinations/impl/weaviate/weaviate_adapter.py
+++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py
@@ -87,6 +87,7 @@ def weaviate_adapter(
TOKENIZATION_HINT: method, # type: ignore
}
+ # this makes sure that {} as column_hints never gets into apply_hints (that would reset existing columns)
if not column_hints:
raise ValueError("Either 'vectorize' or 'tokenization' must be specified.")
else:
diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py
index ae51663d37..e02cdbaa09 100644
--- a/dlt/destinations/job_client_impl.py
+++ b/dlt/destinations/job_client_impl.py
@@ -169,6 +169,9 @@ def sql_client(self, client: SqlClientBase[TNativeConn]) -> None:
def drop_storage(self) -> None:
self.sql_client.drop_dataset()
+ with contextlib.suppress(DatabaseUndefinedRelation):
+ with self.sql_client.with_staging_dataset():
+ self.sql_client.drop_dataset()
def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
if not self.is_storage_initialized():
diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py
index 51f3211f1b..afd4f82f6c 100644
--- a/dlt/destinations/sql_client.py
+++ b/dlt/destinations/sql_client.py
@@ -70,8 +70,6 @@ def __init__(
staging_dataset_name: str,
capabilities: DestinationCapabilitiesContext,
) -> None:
- if not dataset_name:
- raise ValueError(dataset_name)
self.dataset_name = dataset_name
self.staging_dataset_name = staging_dataset_name
self.database_name = database_name
@@ -260,8 +258,14 @@ def with_alternative_dataset_name(
self.dataset_name = current_dataset_name
def with_staging_dataset(self) -> ContextManager["SqlClientBase[TNativeConn]"]:
+ """Temporarily switch sql client to staging dataset name"""
return self.with_alternative_dataset_name(self.staging_dataset_name)
+ @property
+ def is_staging_dataset_active(self) -> bool:
+ """Checks if staging dataset is currently active"""
+ return self.dataset_name == self.staging_dataset_name
+
def set_query_tags(self, tags: TJobQueryTags) -> None:
"""Sets current schema (source), resource, load_id and table name when a job starts"""
self._query_tags = tags
@@ -302,6 +306,9 @@ def _truncate_table_sql(self, qualified_table_name: str) -> str:
else:
return f"DELETE FROM {qualified_table_name} WHERE 1=1;"
+ def _limit_clause_sql(self, limit: int) -> Tuple[str, str]:
+ return "", f"LIMIT {limit}"
+
class WithSqlClient(JobClientBase):
@property
@@ -329,7 +336,7 @@ def __init__(self, curr: DBApiCursor) -> None:
self.fetchmany = curr.fetchmany # type: ignore
self.fetchone = curr.fetchone # type: ignore
- self.set_default_schema_columns()
+ self._set_default_schema_columns()
def __getattr__(self, name: str) -> Any:
return getattr(self.native_cursor, name)
@@ -339,8 +346,8 @@ def _get_columns(self) -> List[str]:
return [c[0] for c in self.native_cursor.description]
return []
- def set_default_schema_columns(self) -> None:
- self.schema_columns = cast(
+ def _set_default_schema_columns(self) -> None:
+ self.columns_schema = cast(
TTableSchemaColumns, {c: {"name": c, "nullable": True} for c in self._get_columns()}
)
@@ -394,11 +401,11 @@ def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]:
if not chunk_size:
result = self.fetchall()
- yield row_tuples_to_arrow(result, caps, self.schema_columns, tz="UTC")
+ yield row_tuples_to_arrow(result, caps, self.columns_schema, tz="UTC")
return
for result in self.iter_fetch(chunk_size=chunk_size):
- yield row_tuples_to_arrow(result, caps, self.schema_columns, tz="UTC")
+ yield row_tuples_to_arrow(result, caps, self.columns_schema, tz="UTC")
def raise_database_error(f: TFun) -> TFun:
diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py
index ae27213a7c..a389c13170 100644
--- a/dlt/destinations/sql_jobs.py
+++ b/dlt/destinations/sql_jobs.py
@@ -78,7 +78,6 @@ def from_table_chain(
job = cls(file_info.file_name())
job._save_text_file("\n".join(sql))
except Exception as e:
- # raise exception with some context
raise SqlJobCreationException(e, table_chain) from e
return job
@@ -168,12 +167,24 @@ def generate_sql(
merge_strategy = resolve_merge_strategy(
{root_table["name"]: root_table}, root_table, sql_client.capabilities
)
+
+ merge_sql = None
if merge_strategy == "delete-insert":
- return cls.gen_merge_sql(table_chain, sql_client)
+ merge_sql = cls.gen_merge_sql(table_chain, sql_client)
elif merge_strategy == "upsert":
- return cls.gen_upsert_sql(table_chain, sql_client)
+ merge_sql = cls.gen_upsert_sql(table_chain, sql_client)
elif merge_strategy == "scd2":
- return cls.gen_scd2_sql(table_chain, sql_client)
+ merge_sql = cls.gen_scd2_sql(table_chain, sql_client)
+
+ # prepend setup code
+ return cls._gen_table_setup_clauses(table_chain, sql_client) + merge_sql
+
+ @classmethod
+ def _gen_table_setup_clauses(
+ cls, table_chain: Sequence[PreparedTableSchema], sql_client: SqlClientBase[Any]
+ ) -> List[str]:
+ """Subclasses may override this method to generate additional sql statements to run before the merge"""
+ return []
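A hypothetical override (illustration only; the subclass name and statement are assumptions) that emits a session-level statement before the generated merge SQL:

    class MyMergeFollowupJob(SqlMergeFollowupJob):
        @classmethod
        def _gen_table_setup_clauses(cls, table_chain, sql_client):
            # e.g. a dialect specific session setting required by the merge statements
            return ["SET some_session_option = true;"]
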
@classmethod
def _gen_key_table_clauses(
@@ -420,23 +431,43 @@ def _get_row_key_col(
sql_client: SqlClientBase[Any],
table: PreparedTableSchema,
) -> str:
- """Returns name of first column in `table` with `row_key` property. If not found first `unique` hint will be used
+        """Returns the name of the first column in `table` with the `row_key` property.
+        If not found, the first `unique` hint will be used. If no `unique` columns exist,
+        a single primary key column will be used as a fallback.
- Raises `MergeDispositionException` if no such column exists.
+ Returns:
+ str: Name of the column to be used as row key
+
+ Raises:
+ MergeDispositionException: If no suitable column is found based on the search criteria
"""
col = get_first_column_name_with_prop(table, "row_key")
- if col is None:
- col = cls._get_prop_col_or_raise(
- table,
- "unique",
- MergeDispositionException(
- sql_client.fully_qualified_dataset_name(),
- sql_client.fully_qualified_dataset_name(staging=True),
- [t["name"] for t in table_chain],
- f"No `row_key` or `unique` column (e.g. `_dlt_id`) in table `{table['name']}`.",
- ),
+ if col is not None:
+ return col
+
+ col = get_first_column_name_with_prop(table, "unique")
+ if col is not None:
+ return col
+
+ # Try to use a single primary key column as a fallback
+ primary_key_cols = get_columns_names_with_prop(table, "primary_key")
+ if len(primary_key_cols) == 1:
+ return primary_key_cols[0]
+ elif len(primary_key_cols) > 1:
+ raise MergeDispositionException(
+ sql_client.fully_qualified_dataset_name(),
+ sql_client.fully_qualified_dataset_name(staging=True),
+ [t["name"] for t in table_chain],
+ f"Multiple primary key columns found in table `{table['name']}`. "
+ "Cannot use as row_key.",
)
- return col
+
+ raise MergeDispositionException(
+ sql_client.fully_qualified_dataset_name(),
+ sql_client.fully_qualified_dataset_name(staging=True),
+ [t["name"] for t in table_chain],
+ "No `row_key`, `unique`, or single primary key column (e.g. `_dlt_id`) "
+ f"in table `{table['name']}`.",
+ )
@classmethod
def _get_root_key_col(
diff --git a/dlt/destinations/utils.py b/dlt/destinations/utils.py
index cd3ee6a54d..3874bf3ae3 100644
--- a/dlt/destinations/utils.py
+++ b/dlt/destinations/utils.py
@@ -222,3 +222,12 @@ def _convert_to_old_pyformat(
if count != len(args):
raise DatabaseTransientException(operational_error_cls())
return old_style_string, mapping
+
+
+def is_compression_disabled() -> bool:
+ from dlt import config
+
+ key_ = "normalize.data_writer.disable_compression"
+ if key_ not in config:
+ key_ = "data_writer.disable_compression"
+ return config.get(key_, bool)
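The two keys probed here follow the usual dlt config layout and can be set e.g. via environment variables (a sketch, assuming the standard double-underscore mapping):

    import os

    # scoped under the normalize section ...
    os.environ["NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION"] = "true"
    # ... or the global data_writer section used as a fallback
    # os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "true"

    from dlt.destinations.utils import is_compression_disabled
    print(is_compression_disabled())  # True once either key resolves to true
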
diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py
index 343a737c07..69af0d68a6 100644
--- a/dlt/extract/incremental/__init__.py
+++ b/dlt/extract/incremental/__init__.py
@@ -1,7 +1,7 @@
import os
from datetime import datetime # noqa: I251
-from typing import Generic, ClassVar, Any, Optional, Type, Dict
-from typing_extensions import get_origin, get_args
+from typing import Generic, ClassVar, Any, Optional, Type, Dict, Union
+from typing_extensions import get_args
import inspect
from functools import wraps
@@ -41,13 +41,13 @@
LastValueFunc,
OnCursorValueMissing,
)
-from dlt.extract.pipe import Pipe
from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform
from dlt.extract.incremental.transform import (
JsonIncremental,
ArrowIncremental,
IncrementalTransform,
)
+from dlt.extract.incremental.lag import apply_lag
try:
from dlt.common.libs.pyarrow import is_arrow_item
@@ -101,6 +101,7 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa
The values passed explicitly to Incremental will be ignored.
Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded
on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, include, exclude
+ lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func.
"""
# this is config/dataclass so declare members
@@ -111,6 +112,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa
row_order: Optional[TSortOrder] = None
allow_external_schedulers: bool = False
on_cursor_value_missing: OnCursorValueMissing = "raise"
+ lag: Optional[float] = None
+ duplicate_cursor_warning_threshold: ClassVar[int] = 200
# incremental acting as empty
EMPTY: ClassVar["Incremental[Any]"] = None
@@ -126,6 +129,7 @@ def __init__(
row_order: Optional[TSortOrder] = None,
allow_external_schedulers: bool = False,
on_cursor_value_missing: OnCursorValueMissing = "raise",
+ lag: Optional[float] = None,
) -> None:
# make sure that path is valid
if cursor_path:
@@ -149,6 +153,8 @@ def __init__(
self._cached_state: IncrementalColumnState = None
"""State dictionary cached on first access"""
+
+ self.lag = lag
super().__init__(lambda x: x) # TODO:
self.end_out_of_range: bool = False
@@ -185,6 +191,7 @@ def _make_transforms(self) -> None:
self._primary_key,
set(self._cached_state["unique_hashes"]),
self.on_cursor_value_missing,
+ self.lag,
)
@classmethod
@@ -208,9 +215,14 @@ def merge(self, other: "Incremental[TCursorValue]") -> "Incremental[TCursorValue
>>> my_resource(updated=incremental(initial_value='2023-01-01', end_value='2023-02-01'))
"""
# func, resource name and primary key are not part of the dict
- kwargs = dict(self, last_value_func=self.last_value_func, primary_key=self._primary_key)
+ kwargs = dict(
+ self, last_value_func=self.last_value_func, primary_key=self._primary_key, lag=self.lag
+ )
for key, value in dict(
- other, last_value_func=other.last_value_func, primary_key=other.primary_key
+ other,
+ last_value_func=other.last_value_func,
+ primary_key=other.primary_key,
+ lag=other.lag,
).items():
if value is not None:
kwargs[key] = value
@@ -284,6 +296,7 @@ def parse_native_representation(self, native_value: Any) -> None:
self._primary_key = merged._primary_key
self.allow_external_schedulers = merged.allow_external_schedulers
self.row_order = merged.row_order
+ self.lag = merged.lag
self.__is_resolved__ = self.__is_resolved__
else: # TODO: Maybe check if callable(getattr(native_value, '__lt__', None))
# Passing bare value `incremental=44` gets parsed as initial_value
@@ -335,7 +348,25 @@ def _cursor_datetime_check(value: Any, arg_name: str) -> None:
@property
def last_value(self) -> Optional[TCursorValue]:
s = self.get_state()
- return s["last_value"] # type: ignore
+ last_value: TCursorValue = s["last_value"]
+
+ if self.lag:
+ if self.last_value_func not in (max, min):
+ logger.warning(
+ f"Lag on {self.resource_name} is only supported for max or min last_value_func."
+ f" Provided: {self.last_value_func}"
+ )
+ elif self.end_value is not None:
+ logger.info(
+ f"Lag on {self.resource_name} is deactivated if end_value is set in"
+ " incremental."
+ )
+ elif last_value is not None:
+ last_value = apply_lag(
+ self.lag, s["initial_value"], last_value, self.last_value_func
+ )
+
+ return last_value
def _transform_item(
self, transformer: IncrementalTransform, row: TDataItem
@@ -499,12 +530,28 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]:
transformer.compute_unique_value(row, self.primary_key)
for row in transformer.last_rows
)
+ initial_hash_count = len(self._cached_state.get("unique_hashes", []))
# add directly computed hashes
unique_hashes.update(transformer.unique_hashes)
self._cached_state["unique_hashes"] = list(unique_hashes)
+ final_hash_count = len(self._cached_state["unique_hashes"])
+ self._check_duplicate_cursor_threshold(initial_hash_count, final_hash_count)
return rows
+ def _check_duplicate_cursor_threshold(
+ self, initial_hash_count: int, final_hash_count: int
+ ) -> None:
+ if initial_hash_count <= Incremental.duplicate_cursor_warning_threshold < final_hash_count:
+ logger.warning(
+ f"Large number of records ({final_hash_count}) sharing the same value of "
+ f"cursor field '{self.cursor_path}'. This can happen if the cursor "
+ "field has a low resolution (e.g., only stores dates without times), "
+ "causing many records to share the same cursor value. "
+ "Consider using a cursor column with higher resolution to reduce "
+ "the deduplication state size."
+ )
+
Incremental.EMPTY = Incremental[Any]()
Incremental.EMPTY.__is_resolved__ = True
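End-to-end usage of the new `lag` argument (a minimal sketch; fetch_events and the column names are made up):

    import dlt

    @dlt.resource(primary_key="id", write_disposition="merge")
    def events(
        updated_at=dlt.sources.incremental(
            "updated_at", initial_value="2024-01-01T00:00:00Z", lag=3600
        )
    ):
        # re-reads a one hour attribution window below the stored last_value on each run
        yield from fetch_events(since=updated_at.last_value)
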
diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py
new file mode 100644
index 0000000000..ee102a9961
--- /dev/null
+++ b/dlt/extract/incremental/lag.py
@@ -0,0 +1,74 @@
+from datetime import datetime, timedelta, date # noqa: I251
+from typing import Union
+
+from dlt.common import logger
+from dlt.common.time import ensure_pendulum_datetime, detect_datetime_format
+
+from . import TCursorValue, LastValueFunc
+
+
+def _apply_lag_to_value(
+ lag: float, value: TCursorValue, last_value_func: LastValueFunc[TCursorValue]
+) -> TCursorValue:
+    """Applies lag to a value. For `str` values it attempts to return a string
+    with the lag applied, preserving the original datetime/date format.
+ """
+ # Determine if the input is originally a string and capture its format
+ is_str = isinstance(value, str)
+ value_format = detect_datetime_format(value) if is_str else None
+ is_str_date = value_format in ("%Y%m%d", "%Y-%m-%d") if value_format else None
+ parsed_value = ensure_pendulum_datetime(value) if is_str else value
+
+ if isinstance(parsed_value, (datetime, date)):
+ parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date)
+ # go back to string or pass exact type
+ value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment]
+
+ elif isinstance(parsed_value, (int, float)):
+ value = _apply_lag_to_number(lag, parsed_value, last_value_func) # type: ignore[assignment]
+
+ else:
+ logger.error(
+ f"Lag is not supported for cursor type: {type(value)} with last_value_func:"
+ f" {last_value_func}. Strings must parse to DateTime or Date."
+ )
+
+ return value
+
+
+def _apply_lag_to_datetime(
+ lag: float,
+ value: Union[datetime, date],
+ last_value_func: LastValueFunc[TCursorValue],
+ is_str_date: bool,
+) -> Union[datetime, date]:
+ if isinstance(value, datetime) and not is_str_date:
+ delta = timedelta(seconds=lag)
+ elif is_str_date or isinstance(value, date):
+ delta = timedelta(days=lag)
+ return value - delta if last_value_func is max else value + delta
+
+
+def _apply_lag_to_number(
+ lag: float, value: Union[int, float], last_value_func: LastValueFunc[TCursorValue]
+) -> Union[int, float]:
+ adjusted_value = value - lag if last_value_func is max else value + lag
+ return int(adjusted_value) if isinstance(value, int) else adjusted_value
+
+
+def apply_lag(
+ lag: float,
+ initial_value: TCursorValue,
+ last_value: TCursorValue,
+ last_value_func: LastValueFunc[TCursorValue],
+) -> TCursorValue:
+    """Applies lag to `last_value` but prevents it from crossing `initial_value`, observing the order of `last_value_func`"""
+    # apply the lag, then clamp the result so that it never crosses initial_value
+ lagged_last_value = _apply_lag_to_value(lag, last_value, last_value_func)
+ if (
+ initial_value is not None
+ and last_value_func((initial_value, lagged_last_value)) == initial_value
+ ):
+ # do not cross initial_value
+ return initial_value
+ return lagged_last_value
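A few worked values for apply_lag, following the code above:

    apply_lag(10, 0, 100, max)             # -> 90 (lag subtracted for max)
    apply_lag(10, 95, 100, max)            # -> 95 (clamped, would cross initial_value)
    apply_lag(1, None, "2024-01-02", max)  # -> "2024-01-01" (date strings: lag in days, format kept)
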
diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py
index 209caabc17..fe15571e41 100644
--- a/dlt/extract/incremental/transform.py
+++ b/dlt/extract/incremental/transform.py
@@ -57,6 +57,7 @@ def __init__(
primary_key: Optional[TTableHintTemplate[TColumnNames]],
unique_hashes: Set[str],
on_cursor_value_missing: OnCursorValueMissing = "raise",
+ lag: Optional[float] = None,
) -> None:
self.resource_name = resource_name
self.cursor_path = cursor_path
@@ -70,7 +71,7 @@ def __init__(
self.unique_hashes = unique_hashes
self.start_unique_hashes = set(unique_hashes)
self.on_cursor_value_missing = on_cursor_value_missing
-
+ self.lag = lag
# compile jsonpath
self._compiled_cursor_path = compile_path(cursor_path)
# for simple column name we'll fallback to search in dict
@@ -109,7 +110,14 @@ def __call__(
@property
def deduplication_disabled(self) -> bool:
- """Skip deduplication when length of the key is 0"""
+        """Skip deduplication when the key length is 0, when end_value is set, or when lag is applied."""
+ # disable deduplication if end value is set - state is not saved
+ if self.end_value is not None:
+ return True
+ # disable deduplication if lag is applied - destination must deduplicate ranges
+ if self.lag and self.last_value_func in (min, max):
+ return True
+ # disable deduplication if primary_key = ()
return isinstance(self.primary_key, (list, tuple)) and len(self.primary_key) == 0
diff --git a/dlt/extract/incremental/typing.py b/dlt/extract/incremental/typing.py
index 6829e6b370..7b7786b529 100644
--- a/dlt/extract/incremental/typing.py
+++ b/dlt/extract/incremental/typing.py
@@ -1,6 +1,6 @@
from typing_extensions import TypedDict
-from typing import Any, Callable, List, Literal, Optional, Sequence, TypeVar
+from typing import Any, Callable, List, Literal, Optional, Sequence, TypeVar, Union
from dlt.common.schema.typing import TColumnNames
from dlt.common.typing import TSortOrder
@@ -25,3 +25,4 @@ class IncrementalArgs(TypedDict, total=False):
end_value: Optional[str]
row_order: Optional[TSortOrder]
allow_external_schedulers: Optional[bool]
+ lag: Optional[Union[float, int]]
diff --git a/dlt/extract/source.py b/dlt/extract/source.py
index dd81717c71..1d984de3e4 100644
--- a/dlt/extract/source.py
+++ b/dlt/extract/source.py
@@ -495,6 +495,9 @@ def __call__(
"""Makes dlt source"""
pass
+ # TODO: make factory to expose SourceReference with actual spec, name and section
+ # model after Destination, which also needs to be broken down into reference and factory
+
def with_args(
self,
*,
@@ -511,6 +514,9 @@ def with_args(
"""Overrides default decorator arguments that will be used to when DltSource instance and returns modified clone."""
+AnySourceFactory = SourceFactory[Any, DltSource]
+
+
class SourceReference:
"""Runtime information on the source/resource"""
@@ -518,7 +524,7 @@ class SourceReference:
"""A registry of all the decorated sources and resources discovered when importing modules"""
SPEC: Type[BaseConfiguration]
- f: SourceFactory[Any, DltSource]
+ f: AnySourceFactory
module: ModuleType
section: str
name: str
@@ -527,7 +533,7 @@ class SourceReference:
def __init__(
self,
SPEC: Type[BaseConfiguration],
- f: SourceFactory[Any, DltSource],
+ f: AnySourceFactory,
module: ModuleType,
section: str,
name: str,
@@ -569,7 +575,7 @@ def to_fully_qualified_ref(ref: str) -> List[str]:
def register(cls, ref_obj: "SourceReference") -> None:
ref = f"{ref_obj.context.name}.{ref_obj.section}.{ref_obj.name}"
if ref in cls.SOURCES:
- logger.warning(f"A source with ref {ref} is already registered and will be overwritten")
+ logger.info(f"A source with ref {ref} is already registered and will be overwritten")
cls.SOURCES[ref] = ref_obj
@classmethod
@@ -582,7 +588,7 @@ def find(cls, ref: str) -> "SourceReference":
raise KeyError(refs)
@classmethod
- def from_reference(cls, ref: str) -> SourceFactory[Any, DltSource]:
+ def from_reference(cls, ref: str) -> AnySourceFactory:
"""Returns registered source factory or imports source module and returns a function.
Expands shorthand notation into section.name eg. "sql_database" is expanded into "sql_database.sql_database"
"""
diff --git a/dlt/load/load.py b/dlt/load/load.py
index 73117e5499..060b2c5d8e 100644
--- a/dlt/load/load.py
+++ b/dlt/load/load.py
@@ -39,7 +39,7 @@
TLoadJobState,
DestinationClientConfiguration,
SupportsStagingDestination,
- TDestination,
+ AnyDestination,
)
from dlt.common.destination.exceptions import (
DestinationTerminalException,
@@ -73,8 +73,8 @@ class Load(Runnable[Executor], WithStepInfo[LoadMetrics, LoadInfo]):
@with_config(spec=LoaderConfiguration, sections=(known_sections.LOAD,))
def __init__(
self,
- destination: TDestination,
- staging_destination: TDestination = None,
+ destination: AnyDestination,
+ staging_destination: AnyDestination = None,
collector: Collector = NULL_COLLECTOR,
is_storage_owner: bool = False,
config: LoaderConfiguration = config.value,
diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py
index b66c711936..037458f9c1 100644
--- a/dlt/pipeline/pipeline.py
+++ b/dlt/pipeline/pipeline.py
@@ -15,8 +15,6 @@
cast,
get_type_hints,
ContextManager,
- Dict,
- Literal,
)
from dlt import version
@@ -29,14 +27,12 @@
from dlt.common.configuration.exceptions import (
ConfigFieldMissingException,
ContextDefaultCannotBeCreated,
- ConfigurationValueError,
)
from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
from dlt.common.destination.exceptions import (
DestinationIncompatibleLoaderFileFormatException,
DestinationNoStagingMode,
DestinationUndefinedEntity,
- DestinationCapabilitiesException,
)
from dlt.common.exceptions import MissingDependencyException
from dlt.common.runtime import signals, apply_runtime_config
@@ -69,7 +65,7 @@
from dlt.common.destination import (
DestinationCapabilitiesContext,
merge_caps_file_formats,
- TDestination,
+ AnyDestination,
LOADER_FILE_FORMATS,
TLoaderFileFormat,
)
@@ -102,7 +98,7 @@
TRefreshMode,
)
from dlt.common.schema import Schema
-from dlt.common.utils import is_interactive
+from dlt.common.utils import make_defunct_class, is_interactive
from dlt.common.warnings import deprecated, Dlt04DeprecationWarning
from dlt.common.versioned_state import json_encode_state, json_decode_state
@@ -298,9 +294,9 @@ class Pipeline(SupportsPipeline):
pipeline_name: str
"""Name of the pipeline"""
- default_schema_name: str = None
- schema_names: List[str] = []
- first_run: bool = False
+ default_schema_name: str
+ schema_names: List[str]
+ first_run: bool
"""Indicates a first run of the pipeline, where run ends with successful loading of the data"""
dev_mode: bool
must_attach_to_local_pipeline: bool
@@ -308,25 +304,25 @@ class Pipeline(SupportsPipeline):
"""A directory where the pipelines' working directories are created"""
working_dir: str
"""A working directory of the pipeline"""
- destination: TDestination = None
- staging: TDestination = None
+ _destination: AnyDestination
+ _staging: AnyDestination
"""The destination reference which is the Destination Class. `destination.destination_name` returns the name string"""
- dataset_name: str = None
+ dataset_name: str
"""Name of the dataset to which pipeline will be loaded to"""
- is_active: bool = False
+ is_active: bool
"""Tells if instance is currently active and available via dlt.pipeline()"""
collector: _Collector
config: PipelineConfiguration
runtime_config: RuntimeConfiguration
- refresh: Optional[TRefreshMode] = None
+ refresh: Optional[TRefreshMode]
def __init__(
self,
pipeline_name: str,
pipelines_dir: str,
pipeline_salt: TSecretStrValue,
- destination: TDestination,
- staging: TDestination,
+ destination: AnyDestination,
+ staging: AnyDestination,
dataset_name: str,
import_schema_path: str,
export_schema_path: str,
@@ -338,13 +334,19 @@ def __init__(
refresh: Optional[TRefreshMode] = None,
) -> None:
"""Initializes the Pipeline class which implements `dlt` pipeline. Please use `pipeline` function in `dlt` module to create a new Pipeline instance."""
+ self.default_schema_name = None
+ self.schema_names = []
+ self.first_run = False
+ self.dataset_name: str = None
+ self.is_active = False
+
self.pipeline_salt = pipeline_salt
self.config = config
self.runtime_config = runtime
self.dev_mode = dev_mode
self.collector = progress or _NULL_COLLECTOR
- self.destination = None
- self.staging = None
+ self._destination = None
+ self._staging = None
self.refresh = refresh
self._container = Container()
@@ -369,20 +371,24 @@ def __init__(
self._set_dataset_name(dataset_name)
def drop(self, pipeline_name: str = None) -> "Pipeline":
- """Deletes local pipeline state, schemas and any working files.
+ """Deletes local pipeline state, schemas and any working files. Re-initializes
+ all internal fields via __init__. If `pipeline_name` is specified that is
+        all internal fields via __init__. If `pipeline_name` is specified and differs
+        from the current name, a new pipeline instance is created, activated and returned.
+        Note that the original pipeline is still dropped.
- pipeline_name (str): Optional. New pipeline name.
+ pipeline_name (str): Optional. New pipeline name. Creates and activates new instance
"""
+ if self.is_active:
+ self.deactivate()
# reset the pipeline working dir
self._create_pipeline()
- # clone the pipeline
- return Pipeline(
- pipeline_name or self.pipeline_name,
+ self.__init__( # type: ignore[misc]
+ self.pipeline_name,
self.pipelines_dir,
self.pipeline_salt,
- self.destination,
- self.staging,
+ self._destination,
+ self._staging,
self.dataset_name,
self._schema_storage.config.import_schema_path,
self._schema_storage.config.export_schema_path,
@@ -392,6 +398,25 @@ def drop(self, pipeline_name: str = None) -> "Pipeline":
self.config,
self.runtime_config,
)
+ if pipeline_name is not None and pipeline_name != self.pipeline_name:
+ self = self.__class__(
+ pipeline_name,
+ self.pipelines_dir,
+ self.pipeline_salt,
+ deepcopy(self._destination),
+ deepcopy(self._staging),
+ self.dataset_name,
+ self._schema_storage.config.import_schema_path,
+ self._schema_storage.config.export_schema_path,
+ self.dev_mode,
+ deepcopy(self.collector),
+ False,
+ self.config,
+ self.runtime_config,
+ )
+ # activate (possibly new) self
+ self.activate()
+ return self
@with_runtime_trace()
@with_schemas_sync # this must precede with_state_sync
@@ -477,8 +502,8 @@ def _verify_destination_capabilities(
# verify loader file format
if loader_file_format and loader_file_format not in caps.supported_loader_file_formats:
raise DestinationIncompatibleLoaderFileFormatException(
- self.destination.destination_name,
- (self.staging.destination_name if self.staging else None),
+ self._destination.destination_name,
+ (self._staging.destination_name if self._staging else None),
loader_file_format,
set(caps.supported_loader_file_formats),
)
@@ -567,8 +592,8 @@ def load(
_load_storage_config=self._load_storage_config(),
)
load_step: Load = Load(
- self.destination,
- staging_destination=self.staging,
+ self._destination,
+ staging_destination=self._staging,
collector=self.collector,
is_storage_owner=False,
config=load_config,
@@ -684,7 +709,7 @@ def run(
self.config.restore_from_destination
and not self.dev_mode
and not self._state_restored
- and (self.destination or destination)
+ and (self._destination or destination)
):
self._sync_destination(destination, staging, dataset_name)
# sync only once
@@ -720,7 +745,7 @@ def run(
else:
return None
- @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing)
+ @with_config_section(sections=(), merge_func=ConfigSectionContext.prefer_existing)
def sync_destination(
self,
destination: TDestinationReferenceArg = None,
@@ -988,7 +1013,7 @@ def get_local_state_val(self, key: str) -> Any:
state = self._get_state()
return state["_local"][key] # type: ignore
- @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing)
+ @with_config_section(sections=(), merge_func=ConfigSectionContext.prefer_existing)
def sql_client(self, schema_name: str = None) -> SqlClientBase[Any]:
"""Returns a sql client configured to query/change the destination and dataset that were used to load the data.
Use the client with `with` statement to manage opening and closing connection to the destination:
@@ -1014,7 +1039,7 @@ def sql_client(self, schema_name: str = None) -> SqlClientBase[Any]:
if isinstance(client, WithSqlClient):
return client.sql_client
else:
- raise SqlClientNotAvailable(self.pipeline_name, self.destination.destination_name)
+ raise SqlClientNotAvailable(self.pipeline_name, self._destination.destination_name)
def _fs_client(self, schema_name: str = None) -> FSClientBase:
"""Returns a filesystem client configured to point to the right folder / bucket for each table.
@@ -1031,9 +1056,9 @@ def _fs_client(self, schema_name: str = None) -> FSClientBase:
client = self.destination_client(schema_name)
if isinstance(client, FSClientBase):
return client
- raise FSClientNotAvailable(self.pipeline_name, self.destination.destination_name)
+ raise FSClientNotAvailable(self.pipeline_name, self._destination.destination_name)
- @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing)
+ @with_config_section(sections=(), merge_func=ConfigSectionContext.prefer_existing)
def destination_client(self, schema_name: str = None) -> JobClientBase:
"""Get the destination job client for the configured destination
Use the client with `with` statement to manage opening and closing connection to the destination:
@@ -1046,6 +1071,28 @@ def destination_client(self, schema_name: str = None) -> JobClientBase:
schema = self._get_schema_or_create(schema_name)
return self._get_destination_clients(schema)[0]
+ @property
+ def destination(self) -> AnyDestination:
+ return self._destination
+
+ @destination.setter
+ def destination(self, new_value: AnyDestination) -> None:
+ self._destination = new_value
+ # bind pipeline to factory
+ if self._destination:
+ self._destination.config_params["bound_to_pipeline"] = self
+
+ @property
+ def staging(self) -> AnyDestination:
+ return self._staging
+
+ @staging.setter
+ def staging(self, new_value: AnyDestination) -> None:
+ self._staging = new_value
+ # bind pipeline to factory
+ if self._staging:
+ self._staging.config_params["bound_to_pipeline"] = self
+
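The effect of the new setters in short (a sketch; duckdb is chosen arbitrarily and the factory is assumed to expose config_params as used above):

    import dlt

    pipeline = dlt.pipeline("binding_demo")
    pipeline.destination = dlt.destinations.duckdb("demo.duckdb")
    # the destination factory is now bound to this pipeline instance
    assert pipeline.destination.config_params["bound_to_pipeline"] is pipeline
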
def _get_schema_or_create(self, schema_name: str = None) -> Schema:
if schema_name:
return self.schemas[schema_name]
@@ -1060,7 +1107,7 @@ def _sql_job_client(self, schema: Schema) -> SqlJobClientBase:
if isinstance(client, SqlJobClientBase):
return client
else:
- raise SqlClientNotAvailable(self.pipeline_name, self.destination.destination_name)
+ raise SqlClientNotAvailable(self.pipeline_name, self._destination.destination_name)
def _get_normalize_storage(self) -> NormalizeStorage:
return NormalizeStorage(True, self._normalize_storage_config())
@@ -1203,9 +1250,9 @@ def _extract_source(
return load_id
def _get_destination_client_initial_config(
- self, destination: TDestination = None, as_staging: bool = False
+ self, destination: AnyDestination = None, as_staging: bool = False
) -> DestinationClientConfiguration:
- destination = destination or self.destination
+ destination = destination or self._destination
if not destination:
raise PipelineConfigMissing(
self.pipeline_name,
@@ -1234,7 +1281,14 @@ def _get_destination_client_initial_config(
)
else:
spec = client_spec()
- spec._bind_dataset_name(self.dataset_name, default_schema_name)
+ # in case of destination that does not need dataset name, we still must
+ # provide one to staging
+ # TODO: allow for separate staging_dataset_name, that will require to migrate pipeline state
+ # to store it.
+ dataset_name = self.dataset_name
+ if not dataset_name and as_staging:
+ dataset_name = self._make_dataset_name(None, destination)
+ spec._bind_dataset_name(dataset_name, default_schema_name)
return spec
return client_spec()
@@ -1248,29 +1302,29 @@ def _get_destination_clients(
try:
# resolve staging config in order to pass it to destination client config
staging_client = None
- if self.staging:
+ if self._staging:
if not initial_staging_config:
# this is just initial config - without user configuration injected
initial_staging_config = self._get_destination_client_initial_config(
- self.staging, as_staging=True
+ self._staging, as_staging=True
)
# create the client - that will also resolve the config
- staging_client = self.staging.client(schema, initial_staging_config)
+ staging_client = self._staging.client(schema, initial_staging_config)
if not initial_config:
# config is not provided then get it with injected credentials
- initial_config = self._get_destination_client_initial_config(self.destination)
+ initial_config = self._get_destination_client_initial_config(self._destination)
# attach the staging client config to destination client config - if its type supports it
if (
- self.staging
+ self._staging
and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration)
and isinstance(staging_client.config, DestinationClientStagingConfiguration)
):
initial_config.staging_config = staging_client.config
# create instance with initial_config properly set
- client = self.destination.client(schema, initial_config)
+ client = self._destination.client(schema, initial_config)
return client, staging_client
except ModuleNotFoundError:
- client_spec = self.destination.spec()
+ client_spec = self._destination.spec()
raise MissingDependencyException(
f"{client_spec.destination_type} destination",
[f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"],
@@ -1278,7 +1332,7 @@ def _get_destination_clients(
)
def _get_destination_capabilities(self) -> DestinationCapabilitiesContext:
- if not self.destination:
+ if not self._destination:
raise PipelineConfigMissing(
self.pipeline_name,
"destination",
@@ -1294,10 +1348,10 @@ def _get_destination_capabilities(self) -> DestinationCapabilitiesContext:
naming = self.default_schema.naming
else:
naming = None
- return self.destination.capabilities(naming=naming)
+ return self._destination.capabilities(naming=naming)
def _get_staging_capabilities(self) -> Optional[DestinationCapabilitiesContext]:
- if self.staging is None:
+ if self._staging is None:
return None
# check if default schema is present
if (
@@ -1307,7 +1361,7 @@ def _get_staging_capabilities(self) -> Optional[DestinationCapabilitiesContext]:
naming = self.default_schema.naming
else:
naming = None
- return self.staging.capabilities(naming=naming)
+ return self._staging.capabilities(naming=naming)
def _validate_pipeline_name(self) -> None:
try:
@@ -1331,7 +1385,7 @@ def _set_context(self, is_active: bool) -> None:
self.is_active = is_active
if is_active:
# set destination context on activation
- if self.destination:
+ if self._destination:
# inject capabilities context
self._container[DestinationCapabilitiesContext] = (
self._get_destination_capabilities()
@@ -1350,34 +1404,35 @@ def _set_destinations(
initializing: bool = False,
destination_credentials: Any = None,
) -> None:
- destination_changed = destination is not None and destination != self.destination
+ destination_changed = destination is not None and destination != self._destination
# set destination if provided but do not swap if factory is the same
if destination_changed:
- self.destination = Destination.from_reference(
+ self._destination = Destination.from_reference(
destination, destination_name=destination_name
)
if (
- self.destination
- and not self.destination.capabilities().supported_loader_file_formats
+ self._destination
+ and not self._destination.capabilities().supported_loader_file_formats
and not staging
- and not self.staging
+ and not self._staging
):
logger.warning(
- f"The destination {self.destination.destination_name} requires the filesystem"
+ f"The destination {self._destination.destination_name} requires the filesystem"
" staging destination to be set, but it was not provided. Setting it to"
" 'filesystem'."
)
staging = "filesystem"
staging_name = "filesystem"
- staging_changed = staging is not None and staging != self.staging
+ staging_changed = staging is not None and staging != self._staging
if staging_changed:
staging_module = Destination.from_reference(staging, destination_name=staging_name)
if staging_module and not issubclass(
staging_module.spec, DestinationClientStagingConfiguration
):
raise DestinationNoStagingMode(staging_module.destination_name)
+ # set via property
self.staging = staging_module
if staging_changed or destination_changed:
@@ -1391,8 +1446,11 @@ def _set_destinations(
if not initializing:
self._set_context(is_active=True)
# apply explicit credentials
- if self.destination and destination_credentials:
- self.destination.config_params["credentials"] = destination_credentials
+ if self._destination:
+ if destination_credentials:
+ self._destination.config_params["credentials"] = destination_credentials
+ # set via property
+ self.destination = self._destination
@contextmanager
def _maybe_destination_capabilities(
@@ -1401,7 +1459,7 @@ def _maybe_destination_capabilities(
caps: DestinationCapabilitiesContext = None
injected_caps: ContextManager[DestinationCapabilitiesContext] = None
try:
- if self.destination:
+ if self._destination:
destination_caps = self._get_destination_capabilities()
stage_caps = self._get_staging_capabilities()
injected_caps = self._container.injectable_context(destination_caps)
@@ -1409,8 +1467,8 @@ def _maybe_destination_capabilities(
caps.preferred_loader_file_format, caps.supported_loader_file_formats = (
merge_caps_file_formats(
- self.destination.destination_name,
- (self.staging.destination_name if self.staging else None),
+ self._destination.destination_name,
+ (self._staging.destination_name if self._staging else None),
destination_caps,
stage_caps,
)
@@ -1420,23 +1478,30 @@ def _maybe_destination_capabilities(
if injected_caps:
injected_caps.__exit__(None, None, None)
- def _set_dataset_name(self, new_dataset_name: str) -> None:
- if not new_dataset_name and not self.dataset_name:
+ def _set_dataset_name(self, new_dataset_name: Optional[str]) -> None:
+ if new_dataset_name or not self.dataset_name:
+ self.dataset_name = self._make_dataset_name(new_dataset_name, self._destination)
+
+ def _make_dataset_name(
+ self, new_dataset_name: Optional[str], destination: Optional[AnyDestination]
+ ) -> str:
+ """Generates dataset name for the pipeline based on `new_dataset_name`
+ 1. if name is not provided, default name is created
+ 2. for destinations that do not need dataset names, def. name is not created
+ 3. we add serial number in dev mode
+ 4. we apply layout from pipeline config if present
+ """
+ if not new_dataset_name:
# dataset name is required but not provided - generate the default now
destination_needs_dataset = False
- if self.destination:
- fields = self.destination.spec().get_resolvable_fields()
- dataset_name_type = fields.get("dataset_name")
- # if dataset is required (default!) we create a default dataset name
- destination_needs_dataset = dataset_name_type is not None and not is_optional_type(
- dataset_name_type
- )
+ if destination and issubclass(destination.spec, DestinationClientDwhConfiguration):
+ destination_needs_dataset = destination.spec.needs_dataset_name()
# if destination is not specified - generate dataset
- if not self.destination or destination_needs_dataset:
+ if destination_needs_dataset:
new_dataset_name = self.pipeline_name + self.DEFAULT_DATASET_SUFFIX
if not new_dataset_name:
- return
+ return new_dataset_name
# in case of dev_mode add unique suffix
if self.dev_mode:
@@ -1446,11 +1511,11 @@ def _set_dataset_name(self, new_dataset_name: str) -> None:
new_dataset_name += self._pipeline_instance_id[1:]
else:
new_dataset_name += self._pipeline_instance_id
- self.dataset_name = new_dataset_name
# normalizes the dataset name using the dataset_name_layout
if self.config.dataset_name_layout:
- self.dataset_name = self.config.dataset_name_layout % self.dataset_name
+ new_dataset_name = self.config.dataset_name_layout % new_dataset_name
+ return new_dataset_name
def _set_default_schema_name(self, schema: Schema) -> None:
assert self.default_schema_name is None
@@ -1474,17 +1539,21 @@ def _get_step_info(self, step: WithStepInfo[TStepMetrics, TStepInfo]) -> TStepIn
def _get_state(self) -> TPipelineState:
try:
state = json_decode_state(self._pipeline_storage.load(Pipeline.STATE_FILE))
- return migrate_pipeline_state(
+ migrated_state = migrate_pipeline_state(
self.pipeline_name,
state,
state["_state_engine_version"],
PIPELINE_STATE_ENGINE_VERSION,
)
+ # TODO: move to a migration. this change is local and too small to justify an
+ # engine upgrade
+ _local = migrated_state["_local"]
+ if "initial_cwd" not in _local:
+ _local["initial_cwd"] = os.path.abspath(os.path.curdir)
+ return migrated_state
except FileNotFoundError:
# do not set the state hash, this will happen on first merge
return default_pipeline_state()
- # state["_version_hash"] = generate_version_hash(state)
- # return state
def _optional_sql_job_client(self, schema_name: str) -> Optional[SqlJobClientBase]:
try:
@@ -1516,18 +1585,18 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]:
if state is None:
logger.info(
"The state was not found in the destination"
- f" {self.destination.destination_description}:{dataset_name}"
+ f" {self._destination.destination_description}:{dataset_name}"
)
else:
logger.info(
"The state was restored from the destination"
- f" {self.destination.destination_description}:{dataset_name}"
+ f" {self._destination.destination_description}:{dataset_name}"
)
else:
state = None
logger.info(
"Destination does not support state sync"
- f" {self.destination.destination_description}:{dataset_name}"
+ f" {self._destination.destination_description}:{dataset_name}"
)
return state
finally:
@@ -1547,14 +1616,14 @@ def _get_schemas_from_destination(
if not isinstance(job_client, WithStateSync):
logger.info(
"Destination does not support restoring of pipeline state"
- f" {self.destination.destination_name}"
+ f" {self._destination.destination_name}"
)
return restored_schemas
schema_info = job_client.get_stored_schema(schema_name)
if schema_info is None:
logger.info(
f"The schema {schema.name} was not found in the destination"
- f" {self.destination.destination_name}:{self.dataset_name}"
+ f" {self._destination.destination_name}:{self.dataset_name}"
)
# try to import schema
with contextlib.suppress(FileNotFoundError):
@@ -1564,7 +1633,7 @@ def _get_schemas_from_destination(
logger.info(
f"The schema {schema.name} version {schema.version} hash"
f" {schema.stored_version_hash} was restored from the destination"
- f" {self.destination.destination_name}:{self.dataset_name}"
+ f" {self._destination.destination_name}:{self.dataset_name}"
)
restored_schemas.append(schema)
return restored_schemas
@@ -1601,7 +1670,7 @@ def _state_to_props(self, state: TPipelineState) -> None:
if prop in state["_local"] and not prop.startswith("_"):
setattr(self, prop, state["_local"][prop]) # type: ignore
# staging and destination are taken from state only if not yet set in the pipeline
- if not self.destination:
+ if not self._destination:
self._set_destinations(
destination=state.get("destination_type"),
destination_name=state.get("destination_name"),
@@ -1612,11 +1681,11 @@ def _state_to_props(self, state: TPipelineState) -> None:
# issue warnings that state destination/staging got ignored
state_destination = state.get("destination_type")
if state_destination:
- if self.destination.destination_type != state_destination:
+ if self._destination.destination_type != state_destination:
logger.warning(
f"The destination {state_destination}:{state.get('destination_name')} in"
" state differs from destination"
- f" {self.destination.destination_type}:{self.destination.destination_name} in"
+ f" {self._destination.destination_type}:{self._destination.destination_name} in"
" pipeline and will be ignored"
)
state_staging = state.get("staging_type")
@@ -1634,12 +1703,12 @@ def _props_to_state(self, state: TPipelineState) -> TPipelineState:
for prop in Pipeline.LOCAL_STATE_PROPS:
if not prop.startswith("_"):
state["_local"][prop] = getattr(self, prop) # type: ignore
- if self.destination:
- state["destination_type"] = self.destination.destination_type
- state["destination_name"] = self.destination.destination_name
- if self.staging:
- state["staging_type"] = self.staging.destination_type
- state["staging_name"] = self.staging.destination_name
+ if self._destination:
+ state["destination_type"] = self._destination.destination_type
+ state["destination_name"] = self._destination.destination_name
+ if self._staging:
+ state["staging_type"] = self._staging.destination_type
+ state["staging_name"] = self._staging.destination_name
state["schema_names"] = self._list_schemas_sorted()
return state
@@ -1724,7 +1793,7 @@ def __getstate__(self) -> Any:
def _dataset(self, dataset_type: TDatasetType = "dbapi") -> SupportsReadableDataset:
"""Access helper to dataset"""
return dataset(
- self.destination,
+ self._destination,
self.dataset_name,
schema=(self.default_schema if self.default_schema_name else None),
dataset_type=dataset_type,
diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py
index 11648328f2..81fad89a14 100644
--- a/dlt/pipeline/state_sync.py
+++ b/dlt/pipeline/state_sync.py
@@ -1,3 +1,4 @@
+import os
from copy import copy
from typing import Tuple, cast
@@ -129,5 +130,5 @@ def default_pipeline_state() -> TPipelineState:
return {
**default_versioned_state(),
"_state_engine_version": PIPELINE_STATE_ENGINE_VERSION,
- "_local": {"first_run": True},
+ "_local": {"first_run": True, "initial_cwd": os.path.abspath(os.path.curdir)},
}
diff --git a/dlt/sources/pipeline_templates/__init__.py b/dlt/sources/_core_source_templates/__init__.py
similarity index 100%
rename from dlt/sources/pipeline_templates/__init__.py
rename to dlt/sources/_core_source_templates/__init__.py
diff --git a/dlt/sources/filesystem_pipeline.py b/dlt/sources/_core_source_templates/filesystem_pipeline.py
similarity index 100%
rename from dlt/sources/filesystem_pipeline.py
rename to dlt/sources/_core_source_templates/filesystem_pipeline.py
diff --git a/dlt/sources/rest_api_pipeline.py b/dlt/sources/_core_source_templates/rest_api_pipeline.py
similarity index 100%
rename from dlt/sources/rest_api_pipeline.py
rename to dlt/sources/_core_source_templates/rest_api_pipeline.py
diff --git a/dlt/sources/sql_database_pipeline.py b/dlt/sources/_core_source_templates/sql_database_pipeline.py
similarity index 100%
rename from dlt/sources/sql_database_pipeline.py
rename to dlt/sources/_core_source_templates/sql_database_pipeline.py
diff --git a/dlt/sources/pipeline_templates/.dlt/config.toml b/dlt/sources/_single_file_templates/.dlt/config.toml
similarity index 100%
rename from dlt/sources/pipeline_templates/.dlt/config.toml
rename to dlt/sources/_single_file_templates/.dlt/config.toml
diff --git a/dlt/sources/pipeline_templates/.gitignore b/dlt/sources/_single_file_templates/.gitignore
similarity index 100%
rename from dlt/sources/pipeline_templates/.gitignore
rename to dlt/sources/_single_file_templates/.gitignore
diff --git a/dlt/sources/_single_file_templates/__init__.py b/dlt/sources/_single_file_templates/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/dlt/sources/pipeline_templates/arrow_pipeline.py b/dlt/sources/_single_file_templates/arrow_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/arrow_pipeline.py
rename to dlt/sources/_single_file_templates/arrow_pipeline.py
diff --git a/dlt/sources/pipeline_templates/dataframe_pipeline.py b/dlt/sources/_single_file_templates/dataframe_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/dataframe_pipeline.py
rename to dlt/sources/_single_file_templates/dataframe_pipeline.py
diff --git a/dlt/sources/pipeline_templates/debug_pipeline.py b/dlt/sources/_single_file_templates/debug_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/debug_pipeline.py
rename to dlt/sources/_single_file_templates/debug_pipeline.py
diff --git a/dlt/sources/pipeline_templates/default_pipeline.py b/dlt/sources/_single_file_templates/default_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/default_pipeline.py
rename to dlt/sources/_single_file_templates/default_pipeline.py
diff --git a/dlt/sources/pipeline_templates/fruitshop_pipeline.py b/dlt/sources/_single_file_templates/fruitshop_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/fruitshop_pipeline.py
rename to dlt/sources/_single_file_templates/fruitshop_pipeline.py
diff --git a/dlt/sources/pipeline_templates/github_api_pipeline.py b/dlt/sources/_single_file_templates/github_api_pipeline.py
similarity index 83%
rename from dlt/sources/pipeline_templates/github_api_pipeline.py
rename to dlt/sources/_single_file_templates/github_api_pipeline.py
index 80cac0c525..1423a8dc3a 100644
--- a/dlt/sources/pipeline_templates/github_api_pipeline.py
+++ b/dlt/sources/_single_file_templates/github_api_pipeline.py
@@ -12,11 +12,11 @@
@dlt.resource(write_disposition="replace")
-def github_api_resource(api_secret_key: Optional[str] = dlt.secrets.value):
+def github_api_resource(access_token: Optional[str] = dlt.secrets.value):
url = "https://api.github.com/repos/dlt-hub/dlt/issues"
# Github allows both authenticated and non-authenticated requests (with low rate limits)
- auth = BearerTokenAuth(api_secret_key) if api_secret_key else None
+ auth = BearerTokenAuth(access_token) if access_token else None
for page in paginate(
url, auth=auth, paginator=HeaderLinkPaginator(), params={"state": "open", "per_page": "100"}
):
@@ -24,8 +24,8 @@ def github_api_resource(api_secret_key: Optional[str] = dlt.secrets.value):
@dlt.source
-def github_api_source(api_secret_key: Optional[str] = dlt.secrets.value):
- return github_api_resource(api_secret_key=api_secret_key)
+def github_api_source(access_token: Optional[str] = dlt.secrets.value):
+ return github_api_resource(access_token=access_token)
def run_source() -> None:
diff --git a/dlt/sources/pipeline_templates/requests_pipeline.py b/dlt/sources/_single_file_templates/requests_pipeline.py
similarity index 100%
rename from dlt/sources/pipeline_templates/requests_pipeline.py
rename to dlt/sources/_single_file_templates/requests_pipeline.py
diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py
index 82b97e253b..a4290e0b9b 100644
--- a/dlt/sources/helpers/rest_client/paginators.py
+++ b/dlt/sources/helpers/rest_client/paginators.py
@@ -651,7 +651,7 @@ def __init__(
def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None:
"""Extracts the cursor value from the JSON response."""
values = jsonpath.find_values(self.cursor_path, response.json())
- self._next_reference = values[0] if values else None
+ self._next_reference = values[0] if values and values[0] else None
def update_request(self, request: Request) -> None:
"""Updates the request with the cursor query parameter."""
diff --git a/dlt/sources/rest_api/__init__.py b/dlt/sources/rest_api/__init__.py
index 77e98f55d8..ed55f71e10 100644
--- a/dlt/sources/rest_api/__init__.py
+++ b/dlt/sources/rest_api/__init__.py
@@ -9,6 +9,7 @@
from dlt.common import jsonpath
from dlt.common.schema.schema import Schema
from dlt.common.schema.typing import TSchemaContract
+from dlt.common.utils import exclude_keys
from dlt.extract import Incremental, DltResource, DltSource, decorators
@@ -44,7 +45,7 @@
setup_incremental_object,
create_response_hooks,
)
-from .utils import check_connection, exclude_keys # noqa: F401
+from .utils import check_connection # noqa: F401
PARAM_TYPES: List[ParamBindType] = ["incremental", "resolve"]
MIN_SECRET_MASKING_LENGTH = 3
diff --git a/dlt/sources/rest_api/config_setup.py b/dlt/sources/rest_api/config_setup.py
index b11f2799b9..d03a4fd59b 100644
--- a/dlt/sources/rest_api/config_setup.py
+++ b/dlt/sources/rest_api/config_setup.py
@@ -19,7 +19,7 @@
from dlt.common import logger
from dlt.common.configuration import resolve_configuration
from dlt.common.schema.utils import merge_columns
-from dlt.common.utils import update_dict_nested
+from dlt.common.utils import update_dict_nested, exclude_keys
from dlt.common import jsonpath
from dlt.extract.incremental import Incremental
@@ -65,7 +65,6 @@
Endpoint,
EndpointResource,
)
-from .utils import exclude_keys
PAGINATOR_MAP: Dict[str, Type[BasePaginator]] = {
diff --git a/dlt/sources/rest_api/utils.py b/dlt/sources/rest_api/utils.py
index c1ef181cca..02108bf876 100644
--- a/dlt/sources/rest_api/utils.py
+++ b/dlt/sources/rest_api/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Dict, Any, Mapping, Iterable
+from typing import Tuple
from dlt.common import logger
from dlt.extract.source import DltSource
@@ -10,19 +10,6 @@ def join_url(base_url: str, path: str) -> str:
return base_url + path.lstrip("/")
-def exclude_keys(d: Mapping[str, Any], keys: Iterable[str]) -> Dict[str, Any]:
- """Removes specified keys from a dictionary and returns a new dictionary.
-
- Args:
- d (Mapping[str, Any]): The dictionary to remove keys from.
- keys (Iterable[str]): The keys to remove.
-
- Returns:
- Dict[str, Any]: A new dictionary with the specified keys removed.
- """
- return {k: v for k, v in d.items() if k not in keys}
-
-
def check_connection(
source: DltSource,
*resource_names: str,
diff --git a/docs/examples/incremental_loading/incremental_loading.py b/docs/examples/incremental_loading/incremental_loading.py
index 90c5e93347..cd82f5d6c9 100644
--- a/docs/examples/incremental_loading/incremental_loading.py
+++ b/docs/examples/incremental_loading/incremental_loading.py
@@ -26,7 +26,7 @@
from dlt.common import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TAnyDateTime
-from dlt.sources.helpers.requests import client
+from dlt.sources.helpers import requests
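+# dlt's requests helper is a drop-in replacement for `requests` with built-in retries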
@dlt.source(max_table_nesting=2)
@@ -123,7 +123,7 @@ def get_pages(
# make request and keep looping until there is no next page
get_url = f"{url}{endpoint}"
while get_url:
- response = client.get(get_url, headers=headers, auth=auth, params=params)
+ response = requests.get(get_url, headers=headers, auth=auth, params=params)
response.raise_for_status()
response_json = response.json()
result = response_json[data_point_name]
@@ -146,4 +146,16 @@ def get_pages(
# check that stuff was loaded
row_counts = pipeline.last_trace.last_normalize_info.row_counts
- assert row_counts["ticket_events"] == 17
+ assert row_counts["ticket_events"] > 0, "No ticket events were loaded"
+
+ with pipeline.sql_client() as client:
+ results = client.execute("""
+ SELECT
+ COUNT(DISTINCT ticket_id) as unique_tickets,
+ COUNT(DISTINCT event_type) as event_types
+ FROM ticket_events
+ """).fetchone()
+
+ unique_tickets, event_types = results
+ assert unique_tickets > 0, "No unique tickets were loaded"
+ assert event_types > 0, "No event types were found"
diff --git a/docs/examples/partial_loading/.dlt/config.toml b/docs/examples/partial_loading/.dlt/config.toml
new file mode 100644
index 0000000000..dad6cffd19
--- /dev/null
+++ b/docs/examples/partial_loading/.dlt/config.toml
@@ -0,0 +1,2 @@
+[destination.filesystem]
+bucket_url="s3://dlt-ci-test-bucket"
\ No newline at end of file
diff --git a/docs/examples/partial_loading/.dlt/example.secrets.toml b/docs/examples/partial_loading/.dlt/example.secrets.toml
new file mode 100644
index 0000000000..811614e687
--- /dev/null
+++ b/docs/examples/partial_loading/.dlt/example.secrets.toml
@@ -0,0 +1,3 @@
+[destination.filesystem.credentials]
+aws_access_key_id = "" # copy the access key here
+aws_secret_access_key = "" # copy the secret access key here
\ No newline at end of file
diff --git a/docs/examples/partial_loading/__init__.py b/docs/examples/partial_loading/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/examples/partial_loading/partial_loading.py b/docs/examples/partial_loading/partial_loading.py
new file mode 100644
index 0000000000..1d73a2aec0
--- /dev/null
+++ b/docs/examples/partial_loading/partial_loading.py
@@ -0,0 +1,159 @@
+"""
+---
+title: Backfill to Filesystem with partial replace
+description: Load chess game data from Chess.com into a filesystem destination, while deleting old backfill files.
+keywords: [incremental loading, REST API, dlt, chess.com, data pipeline, backfill management, filesystem]
+---
+
+This script interacts with the Chess.com REST API to extract game data for a specific user on a monthly basis.
+The script retrieves game data for a specified time range, and when additional data is loaded for a different time range,
+it automatically handles de-duplication by deleting any previously loaded files for the overlapping time ranges.
+
+We'll learn:
+
+- How to configure a [REST API source](../dlt-ecosystem/verified-sources/rest_api/basic.md) using
+ the `dlt` library.
+- How to manage and delete old backfill files for de-duplication.
+- How to use [Filesystem](../dlt-ecosystem/destinations/filesystem.md) as a destination for storing extracted data.
+"""
+
+import os
+import re
+from dlt.common import pendulum as p
+from typing import Dict, List, Iterator
+
+import dlt
+from dlt.sources import DltResource
+from dlt.common.pipeline import LoadInfo
+from dlt.destinations.impl.filesystem.filesystem import FilesystemClient
+from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources
+
+
+@dlt.source
+def chess_com_source(username: str, months: List[Dict[str, str]]) -> Iterator[DltResource]:
+ """
+ Configures and yields resources to fetch chess game data for a given user across specified months.
+
+ Args:
+ username (str): Chess.com username to fetch games for.
+ months (List[Dict[str, str]]): List of dictionaries containing 'year' and 'month' keys.
+
+ Yields:
+ DltResource: Resource objects containing fetched game data.
+ """
+ for month in months:
+ year = month["year"]
+ month_str = month["month"]
+ # Configure REST API endpoint for the specific month
+ config: RESTAPIConfig = {
+ "client": {
+ "base_url": "https://api.chess.com/pub/", # Base URL for Chess.com API
+ },
+ "resources": [
+ {
+ "name": f"chess_com_games_{year}_{month_str}", # Unique resource name
+ "write_disposition": "append",
+ "endpoint": {
+ "path": f"player/{username}/games/{year}/{month_str}", # API endpoint path
+ },
+ "primary_key": ["url"], # Primary key to prevent duplicates
+ }
+ ],
+ }
+ yield from rest_api_resources(config)
+
+
+def generate_months(
+ start_year: int, start_month: int, end_year: int, end_month: int
+) -> Iterator[Dict[str, str]]:
+ """
+ Generates a list of months between the start and end dates.
+
+ Args:
+ start_year (int): Starting year.
+ start_month (int): Starting month.
+ end_year (int): Ending year.
+ end_month (int): Ending month.
+
+ Yields:
+ Dict[str, str]: Dictionary containing 'year' and 'month' as strings.
+ """
+ start_date = p.datetime(start_year, start_month, 1)
+ end_date = p.datetime(end_year, end_month, 1)
+ current_date = start_date
+ while current_date <= end_date:
+ yield {"year": str(current_date.year), "month": f"{current_date.month:02d}"}
+ # Move to the next month
+ if current_date.month == 12:
+ current_date = current_date.replace(year=current_date.year + 1, month=1)
+ else:
+ current_date = current_date.replace(month=current_date.month + 1)
+
+
+def delete_old_backfills(load_info: LoadInfo, p: dlt.Pipeline, table_name: str) -> None:
+ """
+ Deletes old backfill files that do not match the current load ID to maintain data integrity.
+
+ Args:
+ load_info (LoadInfo): Information about the current load.
+ p (dlt.Pipeline): The dlt pipeline instance.
+ table_name (str): Name of the table to clean up backfills for.
+ """
+ # Fetch current load id
+ load_id = load_info.loads_ids[0]
+ pattern = re.compile(rf"{load_id}") # Compile regex pattern for the current load ID
+
+ # Initialize the filesystem client
+ fs_client: FilesystemClient = p.destination.client( # type: ignore
+ p.default_schema, initial_config=p._get_destination_client_initial_config(p.destination)
+ )
+
+ # Construct the table directory path
+ table_dir = os.path.join(fs_client.dataset_path, table_name)
+
+ # Check if the table directory exists
+ if fs_client.fs_client.exists(table_dir):
+ # Traverse the table directory
+ for root, _dirs, files in fs_client.fs_client.walk(table_dir, maxdepth=None):
+ for file in files:
+ # Construct the full file path
+ file_path = os.path.join(root, file)
+ # If the file does not match the current load ID, delete it
+ if not pattern.search(file_path):
+ try:
+ fs_client.fs_client.rm(file_path) # Remove the old backfill file
+ print(f"Deleted old backfill file: {file_path}")
+ except Exception as e:
+ print(f"Error deleting file {file_path}: {e}")
+ else:
+ # Inform if the table directory does not exist
+ print(f"Table directory does not exist: {table_dir}")
+
+
+def load_chess_data():
+ """
+ Sets up and runs the dlt pipeline to load chess game data, then manages backfills.
+ """
+ # Initialize the dlt pipeline with filesystem destination
+ pipeline = dlt.pipeline(
+ pipeline_name="chess_com_data", destination="filesystem", dataset_name="chess_games"
+ )
+
+ # Generate the list of months for the desired date range
+ months = list(generate_months(2023, 1, 2023, 12))
+
+ # Create the source with all specified months
+ source = chess_com_source("MagnusCarlsen", months)
+
+ # Run the pipeline to fetch and load data
+ info = pipeline.run(source)
+ print(info)
+
+ # After the run, delete old backfills for each table to maintain data consistency
+ for month in months:
+ table_name = f"chess_com_games_{month['year']}_{month['month']}"
+ delete_old_backfills(info, pipeline, table_name)
+
+
+if __name__ == "__main__":
+ load_chess_data()
diff --git a/docs/examples/partial_loading/requirements.txt b/docs/examples/partial_loading/requirements.txt
new file mode 100644
index 0000000000..e89e448f62
--- /dev/null
+++ b/docs/examples/partial_loading/requirements.txt
@@ -0,0 +1 @@
+dlt[s3]
\ No newline at end of file
diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py
index b6772f4529..e8399fce6e 100644
--- a/docs/tools/check_embedded_snippets.py
+++ b/docs/tools/check_embedded_snippets.py
@@ -5,10 +5,11 @@
import ast
import subprocess
import argparse
+import shutil
from dataclasses import dataclass
from textwrap import dedent
-from typing import List
+from typing import List, Dict
import tomlkit
import yaml
@@ -24,6 +25,7 @@
LINT_TEMPLATE = "./lint_setup/template.py"
LINT_FILE = "./lint_setup/lint_me.py"
+LINT_FOLDER = "./lint_setup/lint_me"
ENABLE_MYPY = True
@@ -178,16 +180,38 @@ def parse_snippets(snippets: List[Snippet], verbose: bool) -> None:
fmt.note("All snippets could be parsed")
-def prepare_for_linting(snippet: Snippet) -> None:
+def prepare_for_linting(snippets: List[Snippet]) -> None:
"""
Prepare the lintme file with the snippet code and the template header
"""
+
with open(LINT_TEMPLATE, "r", encoding="utf-8") as f:
lint_template = f.read()
- with open(LINT_FILE, "w", encoding="utf-8") as f:
- f.write(lint_template)
- f.write("# Snippet start\n\n")
- f.write(snippet.code)
+
+ # prepare folder
+ shutil.rmtree(LINT_FOLDER, ignore_errors=True)
+
+ # assemble files
+ files: Dict[str, str] = {}
+
+ for snippet in snippets:
+ if snippet.file not in files:
+ files[snippet.file] = lint_template
+ existing = files[snippet.file]
+ existing += "\n\n"
+ existing += f"# Snippet start (Line {snippet.line})\n\n"
+ existing += snippet.code
+ files[snippet.file] = existing
+
+ count = 0
+ for filename, content in files.items():
+ count += 1
+ target_file_name = LINT_FOLDER + filename
+ target_file_name = target_file_name.replace(".md", "").replace("..", "")
+ target_file_name += "_" + str(count) + ".py"
+ os.makedirs(os.path.dirname(target_file_name), exist_ok=True)
+ with open(target_file_name, "w", encoding="utf-8") as f:
+ f.write(content)
def lint_snippets(snippets: List[Snippet], verbose: bool) -> None:
@@ -195,50 +219,36 @@ def lint_snippets(snippets: List[Snippet], verbose: bool) -> None:
Lint all python snippets with ruff
"""
fmt.secho(fmt.bold("Linting Python snippets"))
- failed_count = 0
- count = 0
- for snippet in snippets:
- count += 1
- prepare_for_linting(snippet)
- result = subprocess.run(["ruff", "check", LINT_FILE], capture_output=True, text=True)
- if verbose:
- fmt.echo(f"Linting {snippet} ({count} of {len(snippets)})")
- if "error" in result.stdout.lower():
- failed_count += 1
- fmt.warning(f"Failed to lint {str(snippet)}")
- fmt.echo(result.stdout.strip())
- if failed_count:
- fmt.error(f"Failed to lint {failed_count} snippets")
+ prepare_for_linting(snippets)
+ result = subprocess.run(["ruff", "check", LINT_FOLDER], capture_output=True, text=True)
+
+ if "error" in result.stdout.lower():
+ fmt.echo(result.stdout.strip())
+ fmt.error("Failed to lint some snippets")
exit(1)
- else:
- fmt.note("All snippets could be linted")
+
+ fmt.note("All snippets could be linted")
def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None:
"""
- TODO: Type check all python snippets with mypy
+ Type check all python snippets with mypy
"""
fmt.secho(fmt.bold("Type checking Python snippets"))
- failed_count = 0
- count = 0
- for snippet in snippets:
- count += 1
- if verbose:
- fmt.echo(f"Type checking {snippet} ({count} of {len(snippets)})")
- prepare_for_linting(snippet)
- result = subprocess.run(["mypy", LINT_FILE], capture_output=True, text=True)
- if "no issues found" not in result.stdout.lower():
- failed_count += 1
- fmt.warning(f"Failed to type check {str(snippet)}")
- fmt.echo(result.stdout.strip())
- fmt.echo(result.stderr.strip())
- if failed_count:
- fmt.error(f"Failed to type check {failed_count} snippets")
+ prepare_for_linting(snippets)
+ result = subprocess.run(
+ ["mypy", LINT_FOLDER, "--check-untyped-defs"], capture_output=True, text=True
+ )
+
+ if "no issues found" not in result.stdout.lower():
+ fmt.echo(result.stdout.strip())
+ fmt.echo(result.stderr.strip())
+ fmt.error("Failed to type check some snippets")
exit(1)
- else:
- fmt.note("All snippets passed type checking")
+
+ fmt.note("All snippets passed type checking")
if __name__ == "__main__":
@@ -310,4 +320,7 @@ def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None:
if os.path.exists(LINT_FILE):
os.unlink(LINT_FILE)
+ fmt.note("Deleting temporary files...")
+ shutil.rmtree(LINT_FOLDER, ignore_errors=True)
+
fmt.note("All selected checks passed. Snippet Checker 3000 signing off.")
diff --git a/docs/tools/lint_setup/.gitignore b/docs/tools/lint_setup/.gitignore
index 27479bdb04..dbc299319f 100644
--- a/docs/tools/lint_setup/.gitignore
+++ b/docs/tools/lint_setup/.gitignore
@@ -1 +1,2 @@
-lint_me.py
\ No newline at end of file
+lint_me.py
+lint_me
\ No newline at end of file
diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py
index bebc0e9ab0..16cef17450 100644
--- a/docs/tools/lint_setup/template.py
+++ b/docs/tools/lint_setup/template.py
@@ -3,34 +3,174 @@
# mypy: disable-error-code="name-defined,import-not-found,import-untyped,empty-body,no-redef"
# some universal imports
-from typing import Optional, Dict, List, Any, Iterable, Iterator, Tuple, Sequence, Callable
+from typing import (
+ Optional,
+ Dict,
+ List,
+ Any,
+ Iterable,
+ Iterator,
+ Tuple,
+ Sequence,
+ Callable,
+ Union,
+ Generator,
+)
import os
+import duckdb
+import urllib
+import itertools
+import airflow
+import datetime # noqa: I251
+import pendulum # noqa: I251
+
-from datetime import datetime # noqa: I251
-from pendulum import DateTime # noqa: I251
+from airflow.decorators import dag
+#
+# various dlt imports used by snippets
+#
import dlt
from dlt.common import json, pendulum
-from dlt.common.typing import TimedeltaSeconds, TAnyDateTime, TDataItem, TDataItems
-from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns
-
+from dlt.common.typing import (
+ TimedeltaSeconds,
+ TAnyDateTime,
+ TDataItem,
+ TDataItems,
+ StrStr,
+ DictStrAny,
+)
+from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns, TColumnSchema
from dlt.common.pipeline import LoadInfo
-from dlt.sources.helpers import requests
-from dlt.extract import DltResource, DltSource
from dlt.common.configuration.specs import (
GcpServiceAccountCredentials,
ConnectionStringCredentials,
OAuth2Credentials,
BaseConfiguration,
+ AwsCredentials,
+ GcpOAuthCredentials,
+ GcpServiceAccountCredentials,
)
+from dlt.common.libs.pyarrow import Table as ArrowTable
+from dlt.common.data_writers import TDataItemFormat
+
+from dlt.extract.source import SourceFactory
+from dlt.extract.items import DataItemWithMeta
+from dlt.extract import DltResource, DltSource
from dlt.common.storages.configuration import FileSystemCredentials
from dlt.pipeline.exceptions import PipelineStepFailed
+from dlt.common.schema import DataValidationError
+
+#
+# dlt core sources
+#
+from dlt.sources.sql_database import sql_database, sql_table, Table
+from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources
+from dlt.sources.helpers.rest_client.paginators import (
+ BasePaginator,
+ SinglePagePaginator,
+ HeaderLinkPaginator,
+ JSONResponseCursorPaginator,
+ OffsetPaginator,
+ PageNumberPaginator,
+)
+from dlt.sources.helpers.rest_client.auth import BearerTokenAuth, AuthConfigBase
+from dlt.sources.helpers import requests
-# some universal variables
+#
+# some universal variables used by snippets
+# NOTE: these are only used for typechecking, setting to None is ok
+#
pipeline: dlt.Pipeline = None # type: ignore[assignment]
p: dlt.Pipeline = None # type: ignore[assignment]
ex: Exception = None # type: ignore[assignment]
load_info: LoadInfo = None # type: ignore[assignment]
url: str = None # type: ignore[assignment]
+resource: DltResource = None # type: ignore[assignment]
+data: List[Any] = None # type: ignore[assignment]
+item: Any = None # type: ignore[assignment]
+arrow_table: ArrowTable = None # type: ignore[assignment]
+
+my_callable: Callable[..., Any] = None # type: ignore[assignment]
+
+# getters for items
+_get_event_pages: Callable[..., Any] = None # type: ignore[assignment]
+_get_rest_pages: Callable[..., Any] = None # type: ignore[assignment]
+_get_issues_page: Callable[..., Any] = None # type: ignore[assignment]
+_get_data: Callable[..., Any] = None # type: ignore[assignment]
+_get_data_chunked: Callable[..., Any] = None # type: ignore[assignment]
+_get_players_archives: Callable[..., Any] = None # type: ignore[assignment]
+_get_paginated: Callable[..., Any] = None # type: ignore[assignment]
+_get_users: Callable[..., Any] = None # type: ignore[assignment]
+_get_orders: Callable[..., Any] = None # type: ignore[assignment]
+_get_users: Callable[..., Any] = None # type: ignore[assignment]
+_get_details: Callable[..., Any] = None # type: ignore[assignment]
+_get_records: Callable[..., Any] = None # type: ignore[assignment]
+_get_sheet: Callable[..., Any] = None # type: ignore[assignment]
+
+# helpers
+_hash_str: Callable[..., Any] = None # type: ignore[assignment]
+_get_batch_from_bucket: Callable[..., Any] = None # type: ignore[assignment]
+_get_primary_key: Callable[..., Any] = None # type: ignore[assignment]
+_get_path_with_retry: Callable[..., Any] = None # type: ignore[assignment]
+
+#
+#
+#
+
+#
+# Some snippet specific constants (NOTE: please only use these if you can't use one of the above)
+#
+SERVER_NAME: str = ""
+DATABASE_NAME: str = ""
+SERVICE_PRINCIPAL_ID: str = ""
+SERVICE_PRINCIPAL_SECRETS: str = ""
+TENANT_ID: str = ""
+REPO_NAME: str = ""
+MAX_PAGE_SIZE: int = 100
+API_VERSION: str = ""
+FIRST_DAY_OF_MILLENNIUM: TAnyDateTime = pendulum.DateTime(2000, 1, 1)
+START_DATE: pendulum.DateTime = pendulum.DateTime(2024, 1, 1)
+END_DATE: pendulum.DateTime = pendulum.DateTime(2024, 12, 31)
+START_DATE_STRING: str = ""
+API_KEY: str = ""
+ITEMS_PER_PAGE: int = 100
+CHUNK_SIZE: int = 500
+ENDPOINTS: List[str] = []
+RESOURCE_URL: str = ""
+BASE_URL: str = ""
+
+# functions
+hash_string: Callable[[str], str] = None # type: ignore[assignment]
+
+# sources
+my_source: DltSource = None # type: ignore[assignment]
+source: DltSource = None # type: ignore[assignment]
+pipedrive_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+zendesk_support: SourceFactory[Any, Any] = None # type: ignore[assignment]
+facebook_ads_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+chess_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+airtable_emojis: SourceFactory[Any, Any] = None # type: ignore[assignment]
+merge_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+sql_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+data_source: SourceFactory[Any, Any] = None # type: ignore[assignment]
+
+# resources
my_resource: DltResource = None # type: ignore[assignment]
+source: DltResource = None # type: ignore[assignment]
+incremental_resource: DltResource = None # type: ignore[assignment]
+
+# facebook ads
+DEFAULT_ADCREATIVE_FIELDS: List[str] = []
+
+# docs/website/docs/dlt-ecosystem/verified-sources/asana.md
+PROJECT_FIELDS: List[str] = []
+TASK_FIELDS: List[str] = []
+
+# docs/website/docs/dlt-ecosystem/destinations/weaviate.md
+vectorize: List[str] = []
+tokenization: Dict[str, Any] = {}
+
+# docs/website/docs/dlt-ecosystem/verified-sources/chess.md
+players_online_status: DltResource = None # type: ignore[assignment]
diff --git a/docs/tools/mypy.ini b/docs/tools/mypy.ini
index 167ad5b30e..1921a9b405 100644
--- a/docs/tools/mypy.ini
+++ b/docs/tools/mypy.ini
@@ -1,4 +1,3 @@
[mypy]
-ignore_missing_imports = True
no_implicit_optional = False
-strict_optional = False
\ No newline at end of file
+strict_optional = False
diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md
index 0fe483c944..f85d2e19ea 100644
--- a/docs/website/docs/build-a-pipeline-tutorial.md
+++ b/docs/website/docs/build-a-pipeline-tutorial.md
@@ -290,8 +290,8 @@ pipeline = dlt.pipeline(
with pipeline.sql_client() as client:
with client.execute_query(
'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues'
- ) as table:
- reactions = table.df()
+ ) as cursor:
+ reactions = cursor.df()
counts = reactions.sum(0).sort_values(0, ascending=False)
```
diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md
index 822d1c9c07..04b2eb22ae 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/athena.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -100,7 +100,9 @@ Athena tables store timestamps with millisecond precision, and with that precisi
Athena does not support JSON fields, so JSON is stored as a string.
-> ❗**Athena does not support TIME columns in parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them.
+:::caution
+**Athena does not support TIME columns in Parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them.
+:::
### Table and column identifiers
@@ -137,9 +139,10 @@ For every table created as an Iceberg table, the Athena destination will create
The `merge` write disposition is supported for Athena when using Iceberg tables.
-> Note that:
-> 1. There is a risk of tables ending up in an inconsistent state in case a pipeline run fails mid-flight because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`.
-> 2. `dlt` creates additional helper tables called `insert_` and `delete_` in the staging schema to work around Athena's lack of temporary tables.
+:::note
+1. There is a risk of tables ending up in an inconsistent state in case a pipeline run fails mid-flight because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`.
+2. `dlt` creates additional helper tables called `insert_` and `delete_` in the staging schema to work around Athena's lack of temporary tables.
+:::
### dbt support
@@ -156,8 +159,7 @@ aws_data_catalog="awsdatacatalog"
## Supported file formats
-You can choose the following file formats:
-* [parquet](../file-formats/parquet.md) is used by default
+* [Parquet](../file-formats/parquet.md) is used by default.
## Athena adapter
@@ -201,7 +203,7 @@ def partitioned_data():
# Add partitioning hints to the table
athena_adapter(
- partitioned_table,
+ partitioned_data,
partition=[
# Partition per category and month
"category",
diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
index 9dc983bc33..25ae5c8bf9 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -146,8 +146,8 @@ this moment (they are stored as JSON), may be created. You can select certain re
[destination.bigquery]
autodetect_schema=true
```
-We recommend yielding [arrow tables](../verified-sources/arrow-pandas.md) from your resources and using the `parquet` file format to load the data. In that case, the schemas generated by `dlt` and BigQuery
-will be identical. BigQuery will also preserve the column order from the generated parquet files. You can convert `json` data into arrow tables with [pyarrow or duckdb](../verified-sources/arrow-pandas.md#loading-json-documents).
+We recommend yielding [Arrow tables](../verified-sources/arrow-pandas.md) from your resources and using the Parquet file format to load the data. In that case, the schemas generated by `dlt` and BigQuery
+will be identical. BigQuery will also preserve the column order from the generated parquet files. You can convert JSON data into Arrow tables with [pyarrow or duckdb](../verified-sources/arrow-pandas.md#loading-json-documents).
```py
import pyarrow.json as paj
@@ -187,7 +187,7 @@ pipeline.run(
In the example below, we represent JSON data as tables up to nesting level 1. Above this nesting level, we let BigQuery create nested fields.
:::caution
-If you yield data as Python objects (dicts) and load this data as `parquet`, the nested fields will be converted into strings. This is one of the consequences of
+If you yield data as Python objects (dicts) and load this data as Parquet, the nested fields will be converted into strings. This is one of the consequences of
`dlt` not being able to infer nested fields.
:::
@@ -195,17 +195,17 @@ If you yield data as Python objects (dicts) and load this data as `parquet`, the
You can configure the following file formats to load data to BigQuery:
-* [jsonl](../file-formats/jsonl.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
+* [JSONL](../file-formats/jsonl.md) is used by default.
+* [Parquet](../file-formats/parquet.md) is supported.
When staging is enabled:
-* [jsonl](../file-formats/jsonl.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
+* [JSONL](../file-formats/jsonl.md) is used by default.
+* [Parquet](../file-formats/parquet.md) is supported.
:::caution
**BigQuery cannot load JSON columns from Parquet files**. `dlt` will fail such jobs permanently. Instead:
-* Switch to `jsonl` to load and parse JSON properly.
+* Switch to JSONL to load and parse JSON properly (see the sketch below).
* Use schema [autodetect and nested fields](#use-bigquery-schema-autodetect-for-nested-fields)
:::
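+
+A minimal sketch of the first option, assuming a resource `my_resource` that yields JSON-heavy records (the name is illustrative):
+
+```py
+# force JSONL for this run so BigQuery parses JSON columns instead of failing on Parquet
+info = pipeline.run(my_resource, loader_file_format="jsonl")
+```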
@@ -304,7 +304,6 @@ The adapter updates the DltResource with metadata about the destination column a
Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level:
```py
-from datetime import date, timedelta
import dlt
from dlt.destinations.adapters import bigquery_adapter
@@ -319,7 +318,7 @@ from dlt.destinations.adapters import bigquery_adapter
)
def event_data():
yield from [
- {"event_date": date.today() + timedelta(days=i)} for i in range(100)
+ {"event_date": datetime.date.today() + datetime.timedelta(days=i)} for i in range(100)
]
@@ -344,7 +343,8 @@ Some things to note with the adapter's behavior:
- You can cluster on as many columns as you would like.
- Sequential adapter calls on the same resource accumulate parameters, akin to an OR operation, for a unified execution.
-> ❗ At the time of writing, table level options aren't supported for `ALTER` operations.
+:::caution
+At the time of writing, table level options aren't supported for `ALTER` operations.
Note that `bigquery_adapter` updates the resource *in place*, but returns the resource for convenience, i.e., both the following are valid:
@@ -354,6 +354,7 @@ my_resource = bigquery_adapter(my_resource, partition="partition_column_name")
```
Refer to the [full API specification](../../api_reference/destinations/impl/bigquery/bigquery_adapter) for more details.
+:::
diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
index 8f4595b814..5ca25af55c 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
@@ -24,7 +24,7 @@ Let's start by initializing a new `dlt` project as follows:
dlt init chess clickhouse
```
-> 💡 This command will initialize your pipeline with chess as the source and ClickHouse as the destination.
+The `dlt init` command will initialize your pipeline with chess as the source and ClickHouse as the destination.
The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for ClickHouse. You can install the necessary dependencies specified in the requirements file by executing it as follows:
@@ -118,29 +118,33 @@ Data is loaded into ClickHouse using the most efficient method depending on the
## Datasets
-`Clickhouse` does not support multiple datasets in one database; dlt relies on datasets to exist for multiple reasons.
-To make `clickhouse` work with `dlt`, tables generated by `dlt` in your `clickhouse` database will have their names prefixed with the dataset name, separated by
+ClickHouse does not support multiple datasets in one database; dlt relies on datasets to exist for multiple reasons.
+To make ClickHouse work with `dlt`, tables generated by `dlt` in your ClickHouse database will have their names prefixed with the dataset name, separated by
the configurable `dataset_table_separator`.
Additionally, a special sentinel table that doesn't contain any data will be created, so dlt knows which virtual datasets already exist in a
clickhouse
destination.
+:::tip
+`dataset_name` is optional for ClickHouse. When it is skipped, `dlt` will create all tables without a prefix. Note that staging dataset
+tables will still be prefixed with `_staging` (or another name that you configure).
+:::
+
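+A minimal sketch of both modes (pipeline and dataset names are illustrative):
+
+```py
+import dlt
+
+# tables are prefixed with the dataset name and the configured dataset_table_separator
+pipeline = dlt.pipeline(pipeline_name="chess", destination="clickhouse", dataset_name="chess_data")
+
+# dataset_name omitted: tables are created without the prefix (staging tables still get one)
+pipeline = dlt.pipeline(pipeline_name="chess", destination="clickhouse")
+```
+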
## Supported file formats
-- [jsonl](../file-formats/jsonl.md) is the preferred format for both direct loading and staging.
-- [parquet](../file-formats/parquet.md) is supported for both direct loading and staging.
+- [JSONL](../file-formats/jsonl.md) is the preferred format for both direct loading and staging.
+- [Parquet](../file-formats/parquet.md) is supported for both direct loading and staging.
The `clickhouse` destination has a few specific deviations from the default SQL destinations:
-1. `Clickhouse` has an experimental `object` datatype, but we've found it to be a bit unpredictable, so the dlt clickhouse destination will load the `json` datatype to a `text` column.
+1. ClickHouse has an experimental `object` datatype, but we've found it to be a bit unpredictable, so the dlt `clickhouse` destination will load the `json` datatype to a `text` column.
If you need
this feature, get in touch with our Slack community, and we will consider adding it.
-2. `Clickhouse` does not support the `time` datatype. Time will be loaded to a `text` column.
-3. `Clickhouse` does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from `jsonl`, this will be a base64 string; when loading from parquet, this will be
+2. ClickHouse does not support the `time` datatype. Time will be loaded to a `text` column.
+3. ClickHouse does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from JSONL, this will be a base64 string; when loading from parquet, this will be
the `binary` object converted to `text`.
-4. `Clickhouse` accepts adding columns to a populated table that aren’t null.
-5. `Clickhouse` can produce rounding errors under certain conditions when using the float/double datatype. Make sure to use decimal if you can’t afford to have rounding errors. Loading the value
- 12.7001 to a double column with the loader file format jsonl set will predictably produce a rounding error, for example.
+4. ClickHouse accepts adding columns to a populated table that aren’t null.
+5. ClickHouse can produce rounding errors under certain conditions when using the float/double datatype. Make sure to use the decimal datatype if you cannot afford rounding errors; loading the value 12.7001 into a double column with the JSONL loader file format, for example, will predictably produce a rounding error (see the sketch below).
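+
+A minimal sketch for deviation 5 (resource and column names are illustrative): declare the column as `decimal` via column hints so the value is not stored as a float.
+
+```py
+import dlt
+from decimal import Decimal
+
+@dlt.resource(columns={"amount": {"data_type": "decimal"}})
+def payments():
+    # yielding Decimal values avoids the float rounding issue described above
+    yield {"id": 1, "amount": Decimal("12.7001")}
+```
+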
## Supported column hints
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index e484e64aed..513a3b792f 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -128,21 +128,22 @@ See [staging support](#staging-support) for authentication options when `dlt` co
All write dispositions are supported.
## Data loading
-Data is loaded using `INSERT VALUES` statements by default.
+To load data into Databricks, you must set up a staging filesystem by configuring an Amazon S3 or Azure Blob Storage bucket. Parquet is the default file format used for data uploads. As an alternative to Parquet, you can switch to using JSONL.
+
+dlt will upload the data in Parquet files (or JSONL, if configured) to the bucket and then use `COPY INTO` statements to ingest the data into Databricks.
-Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled, `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks.
For more information on staging, see the [staging support](#staging-support) section below.
+
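+A minimal sketch, assuming the filesystem staging bucket and Databricks credentials are configured in `config.toml`/`secrets.toml` (names are illustrative):
+
+```py
+import dlt
+
+pipeline = dlt.pipeline(
+    pipeline_name="databricks_demo",
+    destination="databricks",
+    staging="filesystem",  # Parquet (or JSONL) files are uploaded here, then loaded with COPY INTO
+    dataset_name="demo_data",
+)
+```
+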
## Supported file formats
-* [insert-values](../file-formats/insert-format.md) is used by default.
-* [JSONL](../file-formats/jsonl.md) supported when staging is enabled (see limitations below).
* [Parquet](../file-formats/parquet.md) supported when staging is enabled.
+* [JSONL](../file-formats/jsonl.md) supported when staging is enabled (see limitations below).
The JSONL format has some limitations when used with Databricks:
1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format.
2. The following data types are not supported when using the JSONL format with `databricks`: `decimal`, `json`, `date`, `binary`. Use `parquet` if your data contains these types.
-3. The `bigint` data type with precision is not supported with the `jsonl` format.
+3. The `bigint` data type with precision is not supported with the JSONL format.
## Staging support
@@ -224,11 +225,17 @@ import dlt
bricks = dlt.destinations.databricks(staging_credentials_name="credential_x")
```
+## Additional destination capabilities
+
### dbt support
This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks).
### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).
+### Databricks User Agent
+`dlt` identifies itself to Databricks with a connection user agent so that Databricks can recognize connections created by dlt.
+Databricks uses this user agent identifier to better understand the usage patterns associated with the dlt integration. The connection identifier is `dltHub_dlt`.
+
diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md
index a7f7c5fe16..695d1affe7 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/destination.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md
@@ -90,7 +90,7 @@ The destination decorator supports settings and secrets variables. If you, for e
```py
@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_destination")
-def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None:
+def my_destination(items: TDataItems, table: TTableSchema, api_key: str = dlt.secrets.value) -> None:
...
```
@@ -121,10 +121,10 @@ There are multiple ways to pass the custom destination function to the `dlt` pip
explicitly to the destination function.
```py
@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_destination")
- def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None:
+ def my_destination(items: TDataItems, table: TTableSchema, api_key: str = dlt.secrets.value) -> None:
...
- p = dlt.pipeline("my_pipe", destination=my_destination(api_key=os.getenv("MY_API_KEY")))
+ p = dlt.pipeline("my_pipe", destination=my_destination(api_key=os.getenv("API_KEY"))) # type: ignore[call-arg]
```
- Directly via destination reference. In this case, don't use the decorator for the destination function.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md
index 792e1f1c79..189289db21 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md
@@ -74,13 +74,17 @@ profile_name="dlt-ci-user"
- `replace`
- `merge`
-> The `merge` write disposition uses the default DELETE/UPDATE/INSERT strategy to merge data into the destination. Be aware that Dremio does not support transactions, so a partial pipeline failure can result in the destination table being in an inconsistent state. The `merge` write disposition will eventually be implemented using [MERGE INTO](https://docs.dremio.com/current/reference/sql/commands/apache-iceberg-tables/apache-iceberg-merge/) to resolve this issue.
+:::note
+The `merge` write disposition uses the default DELETE/UPDATE/INSERT strategy to merge data into the destination. Be aware that Dremio does not support transactions, so a partial pipeline failure can result in the destination table being in an inconsistent state. The `merge` write disposition will eventually be implemented using [MERGE INTO](https://docs.dremio.com/current/reference/sql/commands/apache-iceberg-tables/apache-iceberg-merge/) to resolve this issue.
+:::
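+
+For example, a minimal sketch of a resource that requests the `merge` write disposition (the resource name, key, and data are illustrative):
+
+```py
+import dlt
+
+@dlt.resource(write_disposition="merge", primary_key="id")
+def my_items():
+    # rows are deduplicated on `id` when merged into the destination table
+    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]
+```
+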
## Data loading
Data loading happens by copying staged parquet files from an object storage bucket to the destination table in Dremio using [COPY INTO](https://docs.dremio.com/cloud/reference/sql/commands/copy-into-table/) statements. The destination table format is specified by the storage format for the data source in Dremio. Typically, this will be Apache Iceberg.
-> ❗ **Dremio cannot load `fixed_len_byte_array` columns from `parquet` files**.
+:::caution
+Dremio cannot load `fixed_len_byte_array` columns from Parquet files.
+:::
## Dataset creation
diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
index 46290f928e..2b284e991a 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md
@@ -33,7 +33,7 @@ python3 chess_pipeline.py
All write dispositions are supported.
## Data loading
-`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are okay with installing `pyarrow`, we suggest switching to `parquet` as the file format. Loading is faster (and also multithreaded).
+`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are okay with installing `pyarrow`, we suggest switching to Parquet as the file format. Loading is faster (and also multithreaded).
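+
+For example, a minimal sketch of requesting Parquet for a single load (the pipeline and table names are illustrative; `pyarrow` must be installed):
+
+```py
+import dlt
+
+pipeline = dlt.pipeline("duckdb_parquet_example", destination="duckdb")
+# request Parquet files for this load instead of the default INSERT VALUES
+info = pipeline.run([{"id": 1}, {"id": 2}], table_name="items", loader_file_format="parquet")
+print(info)
+```
+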
### Data types
`duckdb` supports various [timestamp types](https://duckdb.org/docs/sql/data_types/timestamp.html). These can be configured using the column flags `timezone` and `precision` in the `dlt.resource` decorator or the `pipeline.run` method.
@@ -95,11 +95,11 @@ dlt.config["schema.naming"] = "duck_case"
## Supported file formats
You can configure the following file formats to load data into duckdb:
* [insert-values](../file-formats/insert-format.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
+* [Parquet](../file-formats/parquet.md) is supported.
:::note
-`duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation, `dlt` serializes the loads. Still, that may be faster than INSERT.
+`duckdb` cannot COPY many Parquet files to a single table from multiple threads. In this situation, dlt serializes the loads. Still, that may be faster than INSERT.
:::
-* [jsonl](../file-formats/jsonl.md)
+* [JSONL](../file-formats/jsonl.md)
:::tip
`duckdb` has [timestamp types](https://duckdb.org/docs/sql/data_types/timestamp.html) with resolutions from milliseconds to nanoseconds. However,
diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
index d36760bcfd..4e9bf1068e 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -377,11 +377,11 @@ The filesystem destination handles the write dispositions as follows:
- `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added.
- `merge` - falls back to `append`
-### 🧪 Merge with delta table format
-The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [`delta`](#delta-table-format) table format.
+### Merge with Delta table format (experimental)
+The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format).
:::caution
-The `upsert` merge strategy for the `filesystem` destination with `delta` table format is considered experimental.
+The `upsert` merge strategy for the filesystem destination with Delta table format is experimental.
:::
```py
@@ -612,14 +612,14 @@ Adopting this layout offers several advantages:
## Supported file formats
You can choose the following file formats:
-* [jsonl](../file-formats/jsonl.md) is used by default
-* [parquet](../file-formats/parquet.md) is supported
-* [csv](../file-formats/csv.md) is supported
+* [JSONL](../file-formats/jsonl.md) is used by default
+* [Parquet](../file-formats/parquet.md) is supported
+* [CSV](../file-formats/csv.md) is supported
## Supported table formats
You can choose the following table formats:
-* [Delta](../table-formats/delta.md) is supported
+* [Delta table](../table-formats/delta.md) is supported
### Delta table format
@@ -643,7 +643,13 @@ def my_delta_resource():
...
```
-> `dlt` always uses `parquet` as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded.
+:::note
+`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded.
+:::
+
+:::caution
+Beware that when loading a large amount of data into a single table, the underlying Rust implementation will consume a lot of memory. This is a known issue, and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate memory consumption by doing multiple smaller incremental pipeline runs.
+:::
#### Delta table partitioning
A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column:
@@ -695,7 +701,7 @@ delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"])
```
-## Syncing of `dlt` state
+## Syncing of dlt state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured.
You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations.
@@ -709,14 +715,14 @@ When a load generates a new state, for example when using incremental loads, a n
When running your pipeline, you might encounter an error like `[Errno 36] File name too long Error`. This error occurs because the generated file name exceeds the maximum allowed length on your filesystem.
To prevent the file name length error, set the `max_identifier_length` parameter for your destination. This truncates all identifiers (including filenames) to a specified maximum length.
-For example:
+For example:
```py
-from dlt.destinations import duckdb
+from dlt.destinations import duckdb as duckdb_destination
pipeline = dlt.pipeline(
pipeline_name="your_pipeline_name",
- destination=duckdb(
+ destination=duckdb_destination(
max_identifier_length=200, # Adjust the length as needed
),
)
diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md
index 083d196aea..b2aec665ab 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md
@@ -123,7 +123,7 @@ Out of the box, LanceDB will act as a normal database. To use LanceDB's embeddin
The `lancedb_adapter` is a helper function that configures the resource for the LanceDB destination:
```py
-lancedb_adapter(data, embed)
+lancedb_adapter(data, embed="title")
```
It accepts the following arguments:
@@ -179,19 +179,61 @@ info = pipeline.run(
### Merge
-The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier.
+The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier. For LanceDB, the merge write disposition only supports the `upsert` strategy, which updates existing records and inserts new ones.
+
+You can specify the merge disposition, primary key, and merge key either in a resource or adapter:
+
+```py
+@dlt.resource(
+ primary_key=["doc_id", "chunk_id"],
+ merge_key=["doc_id"],
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+)
+def my_rag_docs(
+ data: List[DictStrAny],
+) -> Generator[List[DictStrAny], None, None]:
+ yield data
+```
+
+Or:
+
+```py
+pipeline.run(
+ lancedb_adapter(
+ my_new_rag_docs,
+ merge_key="doc_id"
+ ),
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key=["doc_id", "chunk_id"],
+)
+```
+
+The `primary_key` uniquely identifies each record, typically comprising a document ID and a chunk ID.
+The `merge_key`, which cannot be compound, represents the document identifier in your data model (the canonical `doc_id` used in vector databases) and must be the first element of the `primary_key`.
+It is crucial for document identification and orphan removal during merge operations, and this structure keeps record identification consistent with vector database concepts.
+
+
+#### Orphan removal
+
+LanceDB **automatically removes orphaned chunks** when updating or deleting parent documents during a merge operation. To disable this feature:
```py
pipeline.run(
lancedb_adapter(
movies,
embed="title",
+ no_remove_orphans=True # Disable with the `no_remove_orphans` flag.
),
- write_disposition="merge",
- primary_key="id",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key=["doc_id", "chunk_id"],
)
```
+:::note
+While it's possible to omit the `merge_key` for brevity (in which case it is assumed to be the first entry of `primary_key`), explicitly specifying both is recommended for clarity.
+:::
+
### Append
This is the default disposition. It will append the data to the existing data in the destination.
@@ -200,7 +242,6 @@ This is the default disposition. It will append the data to the existing data in
- `dataset_separator`: The character used to separate the dataset name from table names. Defaults to "___".
- `vector_field_name`: The name of the special field to store vector embeddings. Defaults to "vector".
-- `id_field_name`: The name of the special field used for deduplication and merging. Defaults to "id__".
- `max_retries`: The maximum number of retries for embedding operations. Set to 0 to disable retries. Defaults to 3.
## dbt support
diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md
index c59c4e8bb2..da361ca3a0 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md
@@ -87,7 +87,7 @@ To use vector search after the data has been loaded, you must specify which fiel
The `qdrant_adapter` is a helper function that configures the resource for the Qdrant destination:
```py
-qdrant_adapter(data, embed)
+qdrant_adapter(data, embed="title")
```
It accepts the following arguments:
diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md
index a3409ec639..3108004712 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md
@@ -75,16 +75,18 @@ All [write dispositions](../../general-usage/incremental-loading#choosing-a-writ
[SQL Insert](../file-formats/insert-format) is used by default.
When staging is enabled:
-* [jsonl](../file-formats/jsonl.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
+* [JSONL](../file-formats/jsonl.md) is used by default.
+* [Parquet](../file-formats/parquet.md) is supported.
-> ❗ **Redshift cannot load `VARBYTE` columns from `json` files**. `dlt` will fail such jobs permanently. Switch to `parquet` to load binaries.
+:::caution
+- **Redshift cannot load `VARBYTE` columns from JSON files**. `dlt` will fail such jobs permanently. Switch to Parquet to load binaries.
-> ❗ **Redshift cannot load `TIME` columns from `json` or `parquet` files**. `dlt` will fail such jobs permanently. Switch to direct `insert_values` to load time columns.
+- **Redshift cannot load `TIME` columns from JSON or Parquet files**. `dlt` will fail such jobs permanently. Switch to direct `insert_values` to load time columns.
-> ❗ **Redshift cannot detect compression type from `json` files**. `dlt` assumes that `jsonl` files are gzip compressed, which is the default.
+- **Redshift cannot detect compression type from JSON files**. `dlt` assumes that JSONL files are gzip compressed, which is the default.
-> ❗ **Redshift loads `json` types as strings into SUPER with `parquet`**. Use `jsonl` format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON`.
+- **Redshift loads JSON types as strings into SUPER with Parquet**. Use JSONL format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON`.
+:::
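+
+For example, a minimal sketch that chains `filesystem` staging to Redshift and requests Parquet, avoiding the JSONL limitations above (names are illustrative; the bucket URL and credentials are read from your config and secrets):
+
+```py
+import dlt
+
+pipeline = dlt.pipeline(
+    pipeline_name="redshift_staged_example",
+    destination="redshift",
+    staging="filesystem",  # bucket_url and credentials come from config/secrets
+    dataset_name="staged_data",
+)
+# Parquet also allows loading binary columns, which JSONL does not
+info = pipeline.run([{"id": 1, "blob": b"\x00\x01"}], table_name="items", loader_file_format="parquet")
+```
+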
## Supported column hints
@@ -147,7 +149,7 @@ pipeline = dlt.pipeline(
## Supported loader file formats
-Supported loader file formats for Redshift are `sql` and `insert_values` (default). When using a staging location, Redshift supports `parquet` and `jsonl`.
+Supported loader file formats for Redshift are `sql` and `insert_values` (default). When using a staging location, Redshift supports Parquet and JSONL.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
index 084ecb9a62..07cf822973 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
@@ -170,17 +170,17 @@ pipeline.run(events())
## Supported file formats
* [insert-values](../file-formats/insert-format.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
-* [jsonl](../file-formats/jsonl.md) is supported.
-* [csv](../file-formats/csv.md) is supported.
+* [Parquet](../file-formats/parquet.md) is supported.
+* [JSONL](../file-formats/jsonl.md) is supported.
+* [CSV](../file-formats/csv.md) is supported.
When staging is enabled:
-* [jsonl](../file-formats/jsonl.md) is used by default.
-* [parquet](../file-formats/parquet.md) is supported.
-* [csv](../file-formats/csv.md) is supported.
+* [JSONL](../file-formats/jsonl.md) is used by default.
+* [Parquet](../file-formats/parquet.md) is supported.
+* [CSV](../file-formats/csv.md) is supported.
:::caution
-When loading from `parquet`, Snowflake will store `json` types (JSON) in `VARIANT` as a string. Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading.
+When loading from Parquet, Snowflake will store `json` types (JSON) in `VARIANT` as a string. Use the JSONL format instead or use `PARSE_JSON` to update the `VARIANT` field after loading.
:::
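+
+For example, a minimal sketch that hints a column as `json` and loads with JSONL so the value lands in `VARIANT` natively (names and data are illustrative; Snowflake credentials come from config/secrets):
+
+```py
+import dlt
+
+@dlt.resource(columns={"payload": {"data_type": "json"}})
+def events():
+    # `payload` is loaded into a VARIANT column; JSONL keeps it as native JSON
+    yield [{"id": 1, "payload": {"color": "red", "sizes": [1, 2, 3]}}]
+
+pipeline = dlt.pipeline("snowflake_jsonl_example", destination="snowflake")
+info = pipeline.run(events(), loader_file_format="jsonl")
+```
+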
### Custom CSV formats
diff --git a/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md
index 9f33c02337..487f3b73f9 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md
@@ -154,7 +154,7 @@ For example, SQLite does not have `DATETIME` or `TIMESTAMP` types, so `timestamp
## Supported file formats
* [typed-jsonl](../file-formats/jsonl.md) is used by default. JSON-encoded data with typing information included.
-* [parquet](../file-formats/parquet.md) is supported.
+* [Parquet](../file-formats/parquet.md) is supported.
## Supported column hints
diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md
index 043d937d06..c097701c47 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md
@@ -99,16 +99,18 @@ To use **Active Directory Principal**, you can use the `sqlalchemy.engine.URL.cr
```py
conn_str = (
f"DRIVER={{ODBC Driver 18 for SQL Server}};"
- f"SERVER={server_name};"
- f"DATABASE={database_name};"
- f"UID={service_principal_id}@{tenant_id};"
- f"PWD={service_principal_secret};"
+ f"SERVER={SERVER_NAME};"
+ f"DATABASE={DATABASE_NAME};"
+ f"UID={SERVICE_PRINCIPAL_ID}@{TENANT_ID};"
+ f"PWD={SERVICE_PRINCIPAL_SECRETS};"
f"Authentication=ActiveDirectoryServicePrincipal"
)
```
Next, create the connection URL:
```py
+from sqlalchemy.engine import URL
+
connection_url = URL.create(
"mssql+pyodbc",
query={"odbc_connect": conn_str}
@@ -138,10 +140,10 @@ Data is loaded via `INSERT` statements by default.
## Supported file formats
* [insert-values](../file-formats/insert-format.md) is used by default
-* [parquet](../file-formats/parquet.md) is used when [staging](#staging-support) is enabled
+* [Parquet](../file-formats/parquet.md) is used when [staging](#staging-support) is enabled
## Data type limitations
-* **Synapse cannot load `TIME` columns from `parquet` files**. `dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime` to load `TIME` columns.
+* **Synapse cannot load `TIME` columns from Parquet files**. `dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime` to load `TIME` columns.
* **Synapse does not have a nested/JSON/struct data type**. The `dlt` `json` data type is mapped to the `nvarchar` type in Synapse.
## Table index type
@@ -152,7 +154,7 @@ from dlt.destinations.adapters import synapse_adapter
info = pipeline.run(
synapse_adapter(
- data=your_resource,
+ data=my_resource,
table_index_type="clustered_columnstore_index",
)
)
diff --git a/docs/website/docs/dlt-ecosystem/file-formats/csv.md b/docs/website/docs/dlt-ecosystem/file-formats/csv.md
index 687ae3085c..467499a896 100644
--- a/docs/website/docs/dlt-ecosystem/file-formats/csv.md
+++ b/docs/website/docs/dlt-ecosystem/file-formats/csv.md
@@ -11,7 +11,7 @@ import SetTheFormat from './_set_the_format.mdx';
`dlt` uses it for specific use cases - mostly for performance and compatibility reasons.
Internally, we use two implementations:
-- **pyarrow** csv writer - a very fast, multithreaded writer for [arrow tables](../verified-sources/arrow-pandas.md)
+- **pyarrow** CSV writer - a very fast, multithreaded writer for [Arrow tables](../verified-sources/arrow-pandas.md)
- **python stdlib writer** - a csv writer included in the Python standard library for Python objects
## Supported destinations
diff --git a/docs/website/docs/dlt-ecosystem/notebooks.md b/docs/website/docs/dlt-ecosystem/notebooks.md
new file mode 100644
index 0000000000..4486b81b68
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/notebooks.md
@@ -0,0 +1,27 @@
+---
+title: dlt in notebooks
+description: Run dlt in notebooks like Colab, Databricks or Jupyter
+keywords: [notebook, jupyter]
+---
+# dlt in notebooks
+
+## Colab
+You'll need to install `dlt` like any other dependency:
+```sh
+!pip install dlt
+```
+
+You can configure secrets using the **Secrets** sidebar. Create a variable named `secrets.toml` and paste
+the content of the **toml** file from your `.dlt` folder into it. A `config.toml` variable is supported as well.
+
+:::note
+`dlt` will not reload the secrets automatically. Restart your interpreter in the Colab options when you add or change
+the content of the variables above.
+:::
+
+## Streamlit
+`dlt` will look for `secrets.toml` and `config.toml` in the `.dlt` folder. If `secrets.toml` is not found there, it will use
+the `secrets.toml` from the `.streamlit` folder.
+If you run locally, keep your usual `.dlt` folder. When running on Streamlit Cloud, paste the content of your dlt
+`secrets.toml` into the Streamlit secrets.
+
diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md
index 7757d3a90d..dcc4313547 100644
--- a/docs/website/docs/dlt-ecosystem/staging.md
+++ b/docs/website/docs/dlt-ecosystem/staging.md
@@ -48,7 +48,7 @@ Currently, only one destination, the [filesystem](destinations/filesystem.md), c
6. [Snowflake](destinations/snowflake.md#staging-support)
### How to use
-In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below, we'll use `filesystem` staging with `parquet` files to load into the `Redshift` destination.
+In essence, you need to set up two destinations and then pass them to `dlt.pipeline`. Below, we'll use `filesystem` staging with [Parquet](./file-formats/parquet) files to load into the `redshift` destination.
1. **Set up the S3 bucket and filesystem staging.**
@@ -74,7 +74,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel
By default, `dlt` will forward the credentials configured for `filesystem` to the `Redshift` COPY command. If you are fine with this, move to the next step.
-4. **Chain staging to destination and request `parquet` file format.**
+4. **Chain staging to destination and request Parquet file format.**
Pass the `staging` argument to `dlt.pipeline`. It works like the destination `argument`:
```py
@@ -88,9 +88,9 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel
dataset_name='player_data'
)
```
- `dlt` will automatically select an appropriate loader file format for the staging files. Below, we explicitly specify the `parquet` file format (just to demonstrate how to do it):
+ `dlt` will automatically select an appropriate loader file format for the staging files. Below, we explicitly specify the Parquet file format (just to demonstrate how to do it):
```py
- info = pipeline.run(chess(), loader_file_format="parquet")
+ info = pipeline.run(chess_source(), loader_file_format="parquet")
```
5. **Run the pipeline script.**
diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md
index cda4855268..e431313d1c 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md
@@ -21,9 +21,9 @@ pipeline = dlt.pipeline(
with pipeline.sql_client() as client:
with client.execute_query(
'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues'
- ) as table:
+ ) as cursor:
# calling `df` on a cursor, returns the data as a pandas data frame
- reactions = table.df()
+ reactions = cursor.df()
counts = reactions.sum(0).sort_values(0, ascending=False)
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md
index a2e15bfd75..5d3d71f217 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md
@@ -166,6 +166,8 @@ tables in the schema are loaded.
This function retrieves data from a single Airtable table.
```py
+import pyairtable
+
def airtable_resource(
api: pyairtable.Api,
base_id: str,
@@ -200,7 +202,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
base_id = "Please set me up!" # The ID of the base.
airtables = airtable_source(base_id=base_id)
- load_info = pipeline.run(load_data, write_disposition="replace")
+ load_info = pipeline.run(airtables, write_disposition="replace")
```
1. To load selected tables from a base table:
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md
index c4e4268647..7aa82f868e 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md
@@ -251,8 +251,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
# Load the state if it exists.
if os.path.exists(STATE_FILE):
- with open(STATE_FILE, "rb") as f:
- state = json.typed_loadb(f.read())
+ with open(STATE_FILE, "rb") as rf:
+ state = json.typed_loadb(rf.read())
else:
# Provide new state.
state = {}
@@ -266,8 +266,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
print(message)
# Save state after each message to have full transaction load.
# DynamoDB is also OK.
- with open(STATE_FILE, "wb") as f:
- json.typed_dump(managed_state.state, f)
+ with open(STATE_FILE, "wb") as wf:
+ json.typed_dump(managed_state.state, wf)
print(managed_state.state)
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md
index 29b5e5618c..11d4382a22 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md
@@ -10,10 +10,10 @@ import Header from './_source-info-header.md';
You can load data directly from an Arrow table or Pandas dataframe.
-This is supported by all destinations, but it is especially recommended when using destinations that support the `parquet` file format natively (e.g., [Snowflake](../destinations/snowflake.md) and [Filesystem](../destinations/filesystem.md)).
+This is supported by all destinations, but it is especially recommended when using destinations that support the Parquet file format natively (e.g., [Snowflake](../destinations/snowflake.md) and [Filesystem](../destinations/filesystem.md)).
See the [destination support](#destination-support-and-fallback) section for more information.
-When used with a `parquet` supported destination, this is a more performant way to load structured data since `dlt` bypasses many processing steps normally involved in passing JSON objects through the pipeline.
+When used with a destination that supports Parquet, this is a more performant way to load structured data, since `dlt` bypasses many processing steps normally involved in passing JSON objects through the pipeline.
`dlt` automatically translates the Arrow table's schema to the destination table's schema and writes the table to a parquet file, which gets uploaded to the destination without any further processing.
## Usage
@@ -56,9 +56,9 @@ Note: The data in the table must be compatible with the destination database as
## Destination support
-Destinations that support the `parquet` format natively will have the data files uploaded directly as possible. Rewriting files can be avoided completely in many cases.
+Destinations that support the Parquet format natively will have the data files uploaded as directly as possible. Rewriting files can be avoided completely in many cases.
-When the destination does not support `parquet`, the rows are extracted from the table and written in the destination's native format (usually `insert_values`), and this is generally much slower
+When the destination does not support Parquet, the rows are extracted from the table and written in the destination's native format (usually `insert_values`), and this is generally much slower
as it requires processing the table row by row and rewriting data to disk.
The output file format is chosen automatically based on the destination's capabilities, so you can load arrow or pandas frames to any destination, but performance will vary.
@@ -110,7 +110,7 @@ def orders(ordered_at = dlt.sources.incremental('ordered_at')):
# Get a dataframe/arrow table from somewhere
# If your database supports it, you can use the last_value to filter data at the source.
# Otherwise, it will be filtered automatically after loading the data.
- df = get_orders(since=ordered_at.last_value)
+ df = _get_orders(since=ordered_at.last_value)
yield df
pipeline = dlt.pipeline("orders_pipeline", destination="snowflake")
@@ -133,7 +133,7 @@ If you want to skip the default `dlt` JSON normalizer, you can use any available
import duckdb
conn = duckdb.connect()
-table = conn.execute(f"SELECT * FROM read_json_auto('{json_file_path}')").fetch_arrow_table()
+table = conn.execute("SELECT * FROM read_json_auto('./json_file_path')").fetch_arrow_table()
```
Note that **duckdb** and **pyarrow** methods will generate [nested types](#loading-nested-types) for nested data, which are only partially supported by `dlt`.
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md
index 67e52596b2..7212ea18bf 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md
@@ -194,10 +194,10 @@ This [incremental](../../general-usage/incremental-loading.md) resource-transfor
```py
@dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid")
def tasks(
- project_array: t.List[TDataItem],
+ project_array: List[TDataItem],
access_token: str = dlt.secrets.value,
modified_at: dlt.sources.incremental[str] = dlt.sources.incremental(
- "modified_at", initial_value=DEFAULT_START_DATE
+ "modified_at", initial_value=START_DATE_STRING
),
fields: Iterable[str] = TASK_FIELDS,
) -> Iterable[TDataItem]:
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
index 378eedaf62..9436244af7 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
@@ -123,7 +123,7 @@ def players_profiles(players: List[str]) -> Iterator[TDataItem]:
@dlt.defer
def _get_profile(username: str) -> TDataItem:
- return get_path_with_retry(f"player/{username}")
+ return _get_path_with_retry(f"player/{username}")
for username in players:
yield _get_profile(username)
@@ -200,7 +200,7 @@ To create your data loading pipeline for players and load data, follow these ste
```py
# Loads games for Nov 2022
- data = source(
+ source_instance = chess_source(
["magnuscarlsen", "vincentkeymer", "dommarajugukesh", "rpragchess"],
start_month="2022/11",
end_month="2022/11",
@@ -210,7 +210,7 @@ To create your data loading pipeline for players and load data, follow these ste
1. Use the method `pipeline.run()` to execute the pipeline.
```py
- info = pipeline.run(data)
+ info = pipeline.run(source_instance)
# print the information on data that was loaded
print(info)
```
@@ -218,7 +218,7 @@ To create your data loading pipeline for players and load data, follow these ste
1. To load data from specific resources like "players_games" and "player_profiles", modify the above code as:
```py
- info = pipeline.run(data.with_resources("players_games", "players_profiles"))
+ info = pipeline.run(source_instance.with_resources("players_games", "players_profiles"))
# print the information on data that was loaded
print(info)
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
index e559922c6d..bced6ae491 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
@@ -205,9 +205,10 @@ states.
def ads(
fields: Sequence[str] = DEFAULT_AD_FIELDS,
states: Sequence[str] = None,
+    chunk_size: int = 50,
) -> Iterator[TDataItems]:
- yield get_data_chunked(account.get_ads, fields, states, chunk_size)
+ yield _get_data_chunked(account.get_ads, fields, states, chunk_size)
```
`fields`: Retrieves fields for each ad. For example, “id”, “name”, “adset_id”, etc.
@@ -288,7 +289,7 @@ This function fetches Facebook insights data incrementally from a specified star
@dlt.resource(primary_key=INSIGHTS_PRIMARY_KEY, write_disposition="merge")
def facebook_insights(
date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
- "date_start", initial_value=initial_load_start_date_str
+ "date_start", initial_value=START_DATE_STRING
)
) -> Iterator[TDataItems]:
...
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md
index a66a7b1d7f..40824a91f9 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md
@@ -167,7 +167,7 @@ from dlt.sources.filesystem import filesystem
def _copy(item: FileItemDict) -> FileItemDict:
# Instantiate fsspec and copy file
- dest_file = os.path.join(local_folder, item["file_name"])
+ dest_file = os.path.join("./local_folder", item["file_name"])
# Create destination folder
os.makedirs(os.path.dirname(dest_file), exist_ok=True)
# Download file
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
index 6df10323dd..e3f9cfc138 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
@@ -252,8 +252,8 @@ bucket_url='~\Documents\csv_files\'
You can also specify the credentials using environment variables. The name of the corresponding environment variable should be slightly different from the corresponding name in the TOML file. Simply replace dots `.` with double underscores `__`:
```sh
-export SOURCES__FILESYSTEM__AWS_ACCESS_KEY_ID = "Please set me up!"
-export SOURCES__FILESYSTEM__AWS_SECRET_ACCESS_KEY = "Please set me up!"
+export SOURCES__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID="Please set me up!"
+export SOURCES__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY="Please set me up!"
```
:::tip
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md
index 63c26de670..d830a630f8 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md
@@ -145,7 +145,7 @@ This function creates and yields a dlt resource for each endpoint in
def freshdesk_source(
#args as defined above
) -> Iterable[DltResource]:
- for endpoint in endpoints:
+ for endpoint in ENDPOINTS:
yield dlt.resource(
incremental_resource,
name=endpoint,
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md
index 221a2c3009..b17ce08403 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md
@@ -132,7 +132,20 @@ def github_reactions(
max_items: int = None,
max_item_age_seconds: float = None,
) -> Sequence[DltResource]:
- ...
+
+ return dlt.resource(
+ _get_reactions_data(
+ "issues",
+ owner,
+ name,
+ access_token,
+ items_per_page,
+ max_items,
+ max_item_age_seconds,
+ ),
+ name="issues",
+ write_disposition="replace",
+ )
```
`owner`: Refers to the owner of the repository.
@@ -151,22 +164,6 @@ def github_reactions(
The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, their associated comments, and subsequent reactions.
-```py
-dlt.resource(
- _get_reactions_data(
- "issues",
- owner,
- name,
- access_token,
- items_per_page,
- max_items,
- max_item_age_seconds,
- ),
- name="issues",
- write_disposition="replace",
-),
-```
-
### Source `github_repo_events`
This `dlt.source` fetches repository events incrementally, dispatching them to separate tables based on event type. It loads new events only and appends them to tables.
@@ -252,7 +249,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
load_data = github_repo_events(
- "duckdb", "duckdb", access_token=os.getenv(ACCESS_TOKEN)
+ "duckdb", "duckdb", access_token=os.getenv("ACCESS_TOKEN_ENV_VAR")
)
load_info = pipeline.run(load_data)
print(load_info)
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md
index 5e8b247ffd..9e2942f699 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md
@@ -224,6 +224,7 @@ This function returns a list of resources including metadata, fields, and metric
the Google Ads API.
```py
+@dlt.source()
def google_ads(
credentials: Union[
GcpOAuthCredentials, GcpServiceAccountCredentials
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md
index b94606a7e9..03f87b83f4 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md
@@ -249,10 +249,10 @@ the Google Analytics API.
```py
@dlt.source(max_table_nesting=2)
def google_analytics(
- credentials: Union[ GcpOAuthCredentials, GcpServiceAccountCredential ] = dlt.secrets.value,
+ credentials: Union[ GcpOAuthCredentials, GcpServiceAccountCredentials ] = dlt.secrets.value,
property_id: int = dlt.config.value,
queries: List[DictStrAny] = dlt.config.value,
- start_date: Optional[str] = START_DATE,
+ start_date: Optional[str] = START_DATE_STRING,
rows_per_page: int = 1000,
) -> List[DltResource]:
...
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md
index fade2f73a6..b230f481ea 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md
@@ -369,6 +369,7 @@ This function loads data from a Google Spreadsheet. It retrieves data from all s
whether explicitly defined or named, and obtains metadata for the first two rows within each range.
```py
+@dlt.source()
def google_spreadsheet(
spreadsheet_url_or_id: str = dlt.config.value,
range_names: Sequence[str] = dlt.config.value,
@@ -399,7 +400,7 @@ separate tables in the destination.
```py
dlt.resource(
- process_range(rows_data, headers=headers, data_types=data_types),
+ process_range(data, headers=headers, data_types=data_types),
name=name,
write_disposition="replace",
)
@@ -547,7 +548,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
get_named_ranges=False,
)
- data.resources["Sheet 1!A1:B10"].apply_hints(table_name="loaded_data_1")
+ load_data.resources["Sheet 1!A1:B10"].apply_hints(table_name="loaded_data_1")
load_info = pipeline.run(load_data)
print(load_info)
@@ -580,9 +581,11 @@ Below is the correct way to set up an Airflow DAG for this purpose:
- When adding the Google Spreadsheet task to the pipeline, avoid decomposing it; run it as a single task for efficiency.
```py
+from dlt.helpers.airflow_helper import PipelineTasksGroup
+
@dag(
schedule_interval='@daily',
- start_date=pendulum.datetime(2023, 2, 1),
+ start_date=pendulum.DateTime(2023, 2, 1),
catchup=False,
max_active_runs=1,
default_args=default_task_args
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
index 02c651a603..da3b6f7178 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
@@ -140,6 +140,7 @@ This function returns a list of resources to load companies, contacts, deals, ti
def hubspot(
api_key: str = dlt.secrets.value,
include_history: bool = False,
+ include_custom_props: bool = False,
) -> Sequence[DltResource]:
...
```
@@ -155,8 +156,8 @@ This resource function fetches data from the "companies" endpoint and loads it t
```py
@dlt.resource(name="companies", write_disposition="replace")
def companies(
- api_key: str = api_key,
- include_history: bool = include_history,
+ api_key: str = API_KEY,
+ include_history: bool = False,
props: Sequence[str] = DEFAULT_COMPANY_PROPS,
include_custom_props: bool = True,
) -> Iterator[TDataItems]:
@@ -182,7 +183,7 @@ def hubspot_events_for_objects(
object_type: THubspotObjectType,
object_ids: List[str],
api_key: str = dlt.secrets.value,
- start_date: pendulum.DateTime = STARTDATE,
+ start_date: DateTime = START_DATE,
) -> DltResource:
...
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md
index 3d7b577c0f..e6264d24d4 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md
@@ -141,10 +141,10 @@ def inbox_source(
password: str = dlt.secrets.value,
folder: str = "INBOX",
gmail_group: Optional[str] = GMAIL_GROUP,
- start_date: pendulum.DateTime = DEFAULT_START_DATE,
+ start_date: pendulum.DateTime = START_DATE,
filter_emails: Sequence[str] = None,
filter_by_mime_type: Sequence[str] = None,
- chunksize: int = DEFAULT_CHUNK_SIZE,
+ chunksize: int = CHUNK_SIZE,
) -> Sequence[DltResource]:
...
```
@@ -159,13 +159,13 @@ def inbox_source(
`gmail_group`: Google Group email for filtering. Default: `/inbox/settings.py` 'GMAIL_GROUP'.
-`start_date`: Start date to collect emails. Default: `/inbox/settings.py` 'DEFAULT_START_DATE'.
+`start_date`: Start date to collect emails. Default: `/inbox/settings.py` 'START_DATE'.
`filter_emails`: Email addresses for 'FROM' filtering. Default: `/inbox/settings.py` 'FILTER_EMAILS'.
`filter_by_mime_type`: MIME types for attachment filtering. Default: None.
-`chunksize`: UIDs collected per batch. Default: `/inbox/settings.py` 'DEFAULT_CHUNK_SIZE'.
+`chunksize`: UIDs collected per batch. Default: `/inbox/settings.py` 'CHUNK_SIZE'.
### Resource `get_messages_uids`
@@ -240,7 +240,7 @@ verified source.
2. To load messages from "mycreditcard@bank.com" starting "2023-10-1":
- - Set `DEFAULT_START_DATE = pendulum.datetime(2023, 10, 1)` in `./inbox/settings.py`.
+ - Set `START_DATE = pendulum.DateTime(2023, 10, 1)` in `./inbox/settings.py`.
- Use the following code:
```py
# Retrieve messages from the specified email address.
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/index.md
index d37b4393d4..1bf2d93159 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/index.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/index.md
@@ -12,7 +12,7 @@ Planning to use `dlt` in production and need a source that isn't listed? We're h
### Core sources
-item.label === '30+ SQL Databases' || item.label === 'REST APIs' || item.label === 'Filesystem & cloud storage'
+item => item.label === '30+ SQL Databases' || item.label === 'REST APIs' || item.label === 'Cloud storage and filesystem'
)} />
### Verified sources
@@ -24,7 +24,7 @@ If you couldn't find a source implementation, you can easily create your own. Ch
:::
-item.label !== '30+ SQL Databases' && item.label !== 'REST APIs' && item.label !== 'Filesystem & cloud storage'
+item => item.label !== '30+ SQL Databases' && item.label !== 'REST APIs' && item.label !== 'Cloud storage and filesystem'
)} />
### What's the difference between core and verified sources?
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md
index a402e2c5f0..478f7ee52d 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md
@@ -164,8 +164,8 @@ this offset.
```py
topics = ["topic1", "topic2", "topic3"]
- source = kafka_consumer(topics)
- pipeline.run(source, write_disposition="replace")
+ resource = kafka_consumer(topics)
+ pipeline.run(resource, write_disposition="replace")
```
3. To extract messages and process them in a custom way:
@@ -181,15 +181,15 @@ this offset.
"data": msg.value().decode("utf-8"),
}
- data = kafka_consumer("topic", msg_processor=custom_msg_processor)
- pipeline.run(data)
+ resource = kafka_consumer("topic", msg_processor=custom_msg_processor)
+ pipeline.run(resource)
```
4. To extract messages, starting from a timestamp:
```py
- data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15))
- pipeline.run(data)
+ resource = kafka_consumer("topic", start_from=pendulum.DateTime(2023, 12, 15))
+ pipeline.run(resource)
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
index 4322c6806a..e03bf993c5 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
@@ -148,6 +148,7 @@ def matomo_reports(
The function loads visits from the current day and the past `initial_load_past_days` on the first run. In subsequent runs, it continues from the last load and skips active visits until they are closed.
```py
+@dlt.source()
def matomo_visits(
api_token: str = dlt.secrets.value,
url: str = dlt.config.value,
@@ -254,7 +255,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
data_reports = matomo_reports()
- load_info = pipeline_reports.run(data_reports)
+ load_info = pipeline.run(data_reports)
print(load_info)
```
> "site_id" defined in ".dlt/config.toml"
@@ -275,7 +276,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
site_id = 1 # ID of the site for which reports are being loaded
load_data = matomo_reports(queries=queries, site_id=site_id)
- load_info = pipeline_reports.run(load_data)
+ load_info = pipeline.run(load_data)
print(load_info)
```
> You can pass queries and site_id in the ".dlt/config.toml" as well.
@@ -285,7 +286,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
data_reports = matomo_reports()
data_events = matomo_visits()
- load_info = pipeline_reports.run([data_reports, data_events])
+ load_info = pipeline.run([data_reports, data_events])
print(load_info)
```
@@ -293,7 +294,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
load_data = matomo_visits(initial_load_past_days=1, get_live_event_visitors=True)
- load_info = pipeline_events.run(load_data)
+ load_info = pipeline.run(load_data)
print(load_info)
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md
index 9225797773..44d4258cd5 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md
@@ -231,6 +231,7 @@ def mongodb_collection(
collection: str = dlt.config.value,
incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg]
write_disposition: Optional[str] = dlt.config.value,
+ data_item_format: Optional[TDataItemFormat] = "object",
) -> Any:
...
```
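+
+For example, a minimal sketch that requests Arrow tables instead of Python objects (assuming the source was added to your project with `dlt init mongodb duckdb` and the connection is configured in config/secrets):
+
+```py
+import dlt
+from mongodb import mongodb_collection
+
+# yield data as Arrow tables rather than Python dictionaries
+movies = mongodb_collection(collection="movies", data_item_format="arrow")
+
+pipeline = dlt.pipeline("mongodb_arrow_example", destination="duckdb")
+info = pipeline.run(movies)
+```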
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md
index 2ae14de2dc..c533f453e0 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md
@@ -130,6 +130,8 @@ def mux_source() -> Iterable[DltResource]:
The assets_resource function fetches metadata about video assets from the Mux API's "assets" endpoint.
```py
+DEFAULT_LIMIT = 100
+
@dlt.resource(write_disposition="merge")
def assets_resource(
mux_api_access_token: str = dlt.secrets.value,
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md
index f638b5670c..b9b5a36bbd 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md
@@ -129,7 +129,7 @@ This `dlt` source returns data resources like `employees`, `absences`, `absence_
def personio_source(
client_id: str = dlt.secrets.value,
client_secret: str = dlt.secrets.value,
- items_per_page: int = DEFAULT_ITEMS_PER_PAGE,
+ items_per_page: int = ITEMS_PER_PAGE,
) -> Iterable[DltResource]:
...
return (
@@ -163,7 +163,7 @@ def employees(
] = dlt.sources.incremental(
"last_modified_at", initial_value=None, allow_external_schedulers=True
),
- items_per_page: int = items_per_page,
+ items_per_page: int = ITEMS_PER_PAGE,
) -> Iterable[TDataItem]:
...
```
@@ -183,7 +183,7 @@ data incrementally from the Personio API to your preferred destination.
Simple resource, which retrieves a list of various types of employee absences.
```py
@dlt.resource(primary_key="id", write_disposition="replace")
-def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]:
+def absence_types(items_per_page: int = ITEMS_PER_PAGE) -> Iterable[TDataItem]:
...
...
```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
index fa11fdb22d..d23f3f139e 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md
@@ -234,13 +234,13 @@ The configuration object passed to the REST API Generic Source has three main el
```py
config: RESTAPIConfig = {
"client": {
- ...
+ # ...
},
"resource_defaults": {
- ...
+ # ...
},
"resources": [
- ...
+ # ...
],
}
```
@@ -277,15 +277,13 @@ config = {
"resources": [
"resource1",
{
- "resource2": {
- "name": "resource2_name",
- "write_disposition": "append",
- "endpoint": {
- "params": {
- "param1": "value1",
- },
+ "name": "resource2_name",
+ "write_disposition": "append",
+ "endpoint": {
+ "params": {
+ "param1": "value1",
},
- }
+ },
}
],
}
@@ -419,7 +417,13 @@ For more complex pagination methods, you can implement a [custom paginator](../.
Alternatively, you can use the dictionary configuration syntax also for custom paginators. For this, you need to register your custom paginator:
```py
-rest_api.config_setup.register_paginator("custom_paginator", CustomPaginator)
+from dlt.sources.helpers.rest_client.paginators import SinglePagePaginator
+from dlt.sources.rest_api.config_setup import register_paginator
+
+class CustomPaginator(SinglePagePaginator):
+    # custom implementation of SinglePagePaginator
+    pass
+
+register_paginator("custom_paginator", CustomPaginator)
{
# ...
@@ -539,6 +543,8 @@ config = {
"client": {
"auth": BearerTokenAuth(dlt.secrets["your_api_token"]),
},
+ "resources": [
+ ]
# ...
}
```
@@ -562,7 +568,12 @@ For more complex authentication methods, you can implement a [custom authenticat
You can use the dictionary configuration syntax also for custom authentication classes after registering them as follows:
```py
-rest_api.config_setup.register_auth("custom_auth", CustomAuth)
+from dlt.sources.helpers.rest_client.auth import AuthConfigBase
+from dlt.sources.rest_api.config_setup import register_auth
+
+class CustomAuth(AuthConfigBase):
+    # custom implementation of an authentication class
+    pass
+
+register_auth("custom_auth", CustomAuth)
{
# ...
@@ -715,7 +726,7 @@ Instead of defining three different issues resources, one for each of the paths
from dlt.sources.rest_api import RESTAPIConfig
@dlt.resource()
-def repositories() -> Generator[List[Dict[str, Any]]]:
+def repositories() -> Generator[List[Dict[str, Any]], Any, Any]:
"""A seed list of repositories to fetch"""
yield [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}]
@@ -745,7 +756,7 @@ Be careful that the parent resource needs to return `Generator[List[Dict[str, An
```py
@dlt.resource
-def repositories() -> Generator[Dict[str, Any]]:
+def repositories() -> Generator[Dict[str, Any], Any, Any]:
"""Not working seed list of repositories to fetch"""
yield from [{"name": "dlt"}, {"name": "verified-sources"}, {"name": "dlthub-education"}]
```
@@ -946,7 +957,7 @@ The full available configuration for the `incremental` field is:
"cursor_path": "",
"initial_value": "",
"end_value": "",
- "convert": a_callable,
+ "convert": my_callable,
}
}
```
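+
+For example, `my_callable` could be a small helper that converts an epoch-seconds cursor value into an ISO 8601 string (a hypothetical illustration; adapt it to whatever format your API returns):
+
+```py
+from datetime import datetime, timezone
+
+def my_callable(epoch_value):
+    # turn the raw cursor value (epoch seconds) into an ISO 8601 timestamp
+    return datetime.fromtimestamp(int(epoch_value), tz=timezone.utc).isoformat()
+```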
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md
index 8864b3b629..570d07f7ea 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md
@@ -168,7 +168,7 @@ This resource function retrieves records from the Salesforce "User" endpoint.
```py
@dlt.resource(write_disposition="replace")
def sf_user() -> Iterator[Dict[str, Any]]:
- yield from get_records(client, "User")
+ yield from _get_records(client, "User")
```
Besides "sf_user", there are several resources that use replace mode for data writing to the
@@ -193,14 +193,14 @@ def opportunity(
)
) -> Iterator[Dict[str, Any]]:
- yield from get_records(
+ yield from _get_records(
client, "Opportunity", last_timestamp.last_value, "SystemModstamp"
)
```
`last_timestamp`: Argument that will receive [incremental](../../general-usage/incremental-loading)
state, initialized with "initial_value". It is configured to track the "SystemModstamp" field in data
-items returned by "get_records" and then yielded. It will store the newest "SystemModstamp" value in
+items returned by "_get_records" and then yielded. It will store the newest "SystemModstamp" value in
dlt state and make it available in "last_timestamp.last_value" on the next pipeline run.
Besides "opportunity", there are several resources that use replace mode for data writing to the
@@ -235,8 +235,10 @@ To create your data pipeline using single loading and [incremental data loading]
1. To load data from all the endpoints, use the `salesforce_source` method as follows:
```py
+ from dlt.common.schema.typing import TSimpleRegex
+
load_data = salesforce_source()
- source.schema.merge_hints({"not_null": ["id"]}) # Hint for id field not null
+ source.schema.merge_hints({"not_null": [TSimpleRegex("id")]}) # Hint for id field not null
load_info = pipeline.run(load_data)
# print the information on data that was loaded
print(load_info)
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md
index fe11491bd6..d82b208482 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md
@@ -145,14 +145,15 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug
This function returns a list of resources to load products, orders, and customers data from the Shopify API.
```py
+@dlt.source()
def shopify_source(
private_app_password: str = dlt.secrets.value,
- api_version: str = DEFAULT_API_VERSION,
+ api_version: str = API_VERSION,
shop_url: str = dlt.config.value,
start_date: TAnyDateTime = FIRST_DAY_OF_MILLENNIUM,
end_date: Optional[TAnyDateTime] = None,
created_at_min: TAnyDateTime = FIRST_DAY_OF_MILLENNIUM,
- items_per_page: int = DEFAULT_ITEMS_PER_PAGE,
+ items_per_page: int = ITEMS_PER_PAGE,
order_status: TOrderStatus = "any",
) -> Iterable[DltResource]:
...
@@ -185,12 +186,12 @@ def products(
pendulum.DateTime
] = dlt.sources.incremental(
"updated_at",
- initial_value=start_date_obj,
- end_value=end_date_obj,
+ initial_value=START_DATE,
+ end_value=END_DATE,
allow_external_schedulers=True,
),
created_at_min: pendulum.DateTime = created_at_min_obj,
- items_per_page: int = items_per_page,
+ items_per_page: int = ITEMS_PER_PAGE,
) -> Iterable[TDataItem]:
...
```
@@ -212,7 +213,7 @@ def shopify_partner_query(
variables: Optional[Dict[str, Any]] = None,
access_token: str = dlt.secrets.value,
organization_id: str = dlt.config.value,
- api_version: str = DEFAULT_PARTNER_API_VERSION,
+ api_version: str = API_VERSION,
) -> Iterable[TDataItem]:
...
```
@@ -258,9 +259,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
# Add your desired resources to the list...
resources = ["products", "orders", "customers"]
- start_date="2023-01-01"
- load_data = shopify_source(start_date=start_date).with_resources(*resources)
+ load_data = shopify_source(start_date="2023-01-01").with_resources(*resources)
load_info = pipeline.run(load_data)
print(load_info)
```
@@ -269,7 +269,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
# Load all orders from 2023-01-01 to now
- min_start_date = current_start_date = pendulum.datetime(2023, 1, 1)
+ min_start_date = current_start_date = pendulum.DateTime(2023, 1, 1)
max_end_date = pendulum.now()
# Create a list of time ranges of 1 week each, we'll use this to load the data in chunks
ranges: List[Tuple[pendulum.DateTime, pendulum.DateTime]] = []
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md
index 35b12bb64f..5c92ea54f2 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md
@@ -140,7 +140,7 @@ It retrieves data from Slack's API and fetches the Slack data such as channels,
def slack_source(
page_size: int = MAX_PAGE_SIZE,
access_token: str = dlt.secrets.value,
- start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE,
+ start_date: Optional[TAnyDateTime] = START_DATE,
end_date: Optional[TAnyDateTime] = None,
selected_channels: Optional[List[str]] = dlt.config.value,
) -> Iterable[DltResource]:
@@ -186,8 +186,8 @@ def get_messages_resource(
channel_data: Dict[str, Any],
created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
"ts",
- initial_value=start_dt,
- end_value=end_dt,
+ initial_value=START_DATE,
+ end_value=END_DATE,
allow_external_schedulers=True,
),
) -> Iterable[TDataItem]:
@@ -246,7 +246,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
1. To load Slack resources from the specified start date:
```py
- source = slack_source(page_size=1000, start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8))
+ source = slack_source(page_size=1000, start_date=datetime.datetime(2023, 9, 1), end_date=datetime.datetime(2023, 9, 8))
# Enable below to load only 'access_logs', available for paid accounts only.
# source.access_logs.selected = True
@@ -266,8 +266,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
source = slack_source(
page_size=20,
selected_channels=selected_channels,
- start_date=datetime(2023, 9, 1),
- end_date=datetime(2023, 9, 8),
+ start_date=datetime.datetime(2023, 9, 1),
+ end_date=datetime.datetime(2023, 9, 8),
)
# It loads data starting from 1st September 2023 to 8th September 2023 from the channels: "general" and "random".
load_info = pipeline.run(source)
@@ -283,8 +283,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
source = slack_source(
page_size=20,
selected_channels=selected_channels,
- start_date=datetime(2023, 9, 1),
- end_date=datetime(2023, 9, 8),
+ start_date=datetime.datetime(2023, 9, 1),
+ end_date=datetime.datetime(2023, 9, 8),
)
# It loads only messages from the channel "general".
load_info = pipeline.run(source.with_resources("general"))
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
index 886d83658d..2d384e6411 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md
@@ -44,8 +44,8 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I
)
pipeline = dlt.pipeline(destination="duckdb")
- info = pipeline.extract(table, write_disposition="merge")
- print(info)
+ extract_info = pipeline.extract(table, write_disposition="merge")
+ print(extract_info)
```
Behind the scenes, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight, 00:00:00, on January 1, 2024).
@@ -65,8 +65,8 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I
# Running the pipeline
pipeline = dlt.pipeline(destination="duckdb")
- info = pipeline.run(source, write_disposition="merge")
- print(info)
+ load_info = pipeline.run(source, write_disposition="merge")
+ print(load_info)
```
:::info
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md
index acc223f54d..e6058204b0 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md
@@ -143,7 +143,7 @@ credentials = ConnectionStringCredentials(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
)
-source = sql_database(credentials).with_resource("family")
+source = sql_database(credentials).with_resources("family")
```
:::note
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md
index d0930716d8..d5c56d3051 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md
@@ -56,6 +56,16 @@ sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_dat
sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?LongAsMax=yes&driver=ODBC+Driver 17+for+SQL+Server"
```
+**To fix MS SQL Server connection issues with ConnectorX**:
+
+Some users have reported issues with MS SQL Server and ConnectorX. The problems are not caused by dlt, but by how connections are made. A big thanks to [Mark-James M](https://github.com/markjamesm) for suggesting a solution.
+
+To fix connection issues with ConnectorX and MS SQL Server, include both `Encrypt=yes` and `encrypt=true` in your connection string:
+```toml
+sources.sql_database.credentials="mssql://user:password@server:1433/database?driver=ODBC+Driver+17+for+SQL+Server&Encrypt=yes&encrypt=true"
+```
+This approach can help resolve connection-related issues.
+
## Troubleshooting backends
### Notes on specific databases
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md
index 15f75ac313..d1223b879e 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md
@@ -120,7 +120,7 @@ You can write your own pipelines to load data to a destination using this verifi
```py
# The most popular Stripe API's endpoints
-ENDPOINTS = ("Subscription", "Account", "Coupon", "Customer", "Product", "Price")
+STRIPE_ENDPOINTS = ("Subscription", "Account", "Coupon", "Customer", "Product", "Price")
# Possible incremental endpoints
# The incremental endpoints default to Stripe API endpoints with uneditable data.
INCREMENTAL_ENDPOINTS = ("Event", "Invoice", "BalanceTransaction")
@@ -134,7 +134,7 @@ This function retrieves data from the Stripe API for the specified endpoint:
```py
@dlt.source
def stripe_source(
- endpoints: Tuple[str, ...] = ENDPOINTS,
+ endpoints: Tuple[str, ...] = STRIPE_ENDPOINTS,
stripe_secret_key: str = dlt.secrets.value,
start_date: Optional[DateTime] = None,
end_date: Optional[DateTime] = None,
@@ -192,8 +192,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
source_single = stripe_source(
endpoints=("Plan", "Charge"),
- start_date=datetime(2022, 1, 1),
- end_date=datetime(2022, 12, 31),
+ start_date=pendulum.DateTime(2022, 1, 1),
+ end_date=pendulum.DateTime(2022, 12, 31),
)
load_info = pipeline.run(source_single)
print(load_info)
@@ -205,8 +205,8 @@ If you wish to create your own pipelines, you can leverage source and resource m
# Load all data on the first run that was created after start_date and before end_date
source_incremental = incremental_stripe_source(
endpoints=("Invoice", ),
- initial_start_date=datetime(2022, 1, 1),
- end_date=datetime(2022, 12, 31),
+ initial_start_date=pendulum.DateTime(2022, 1, 1),
+ end_date=pendulum.DateTime(2022, 12, 31),
)
load_info = pipeline.run(source_incremental)
print(load_info)
@@ -218,7 +218,7 @@ If you wish to create your own pipelines, you can leverage source and resource m
```py
source_single = stripe_source(
endpoints=("Plan", "Charge"),
- start_date=datetime(2022, 12, 31),
+ start_date=pendulum.DateTime(2022, 12, 31),
)
source_incremental = incremental_stripe_source(
endpoints=("Invoice", ),
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md
index 1d6f59dd3e..ebec3cc8c7 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md
@@ -236,7 +236,7 @@ To create your data pipeline using single loading and [incremental data loading]
1. To load data from a specific date, including dependent endpoints:
```py
- load_data = workable_source(start_date=datetime(2022, 1, 1), load_details=True)
+ load_data = workable_source(start_date=pendulum.DateTime(2022, 1, 1), load_details=True)
load_info = pipeline.run(load_data)
print(load_info)
```
@@ -258,7 +258,7 @@ To create your data pipeline using single loading and [incremental data loading]
1. To load data from the “jobs” endpoint and its dependent endpoints like "activities" and "application_form":
```py
- load_data = workable_source(start_date=datetime(2022, 2, 1), load_details=True)
+ load_data = workable_source(start_date=pendulum.DateTime(2022, 2, 1), load_details=True)
# Set the load_details as True to load all the dependent endpoints.
load_info = pipeline.run(load_data.with_resources("jobs","jobs_activities","jobs_application_form"))
print(load_info)
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md
index b34bc83087..9dae73f6ac 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md
@@ -180,7 +180,7 @@ For more information, read the guide on [how to add a verified source.](../../wa
information securely, like access tokens. Keep this file safe. Here's its format for service
account authentication:
- ```py
+ ```toml
#Zendesk support credentials
[sources.zendesk.credentials]
subdomain = "subdomain" # Zendesk subdomain
@@ -247,7 +247,7 @@ This function retrieves data from Zendesk Talk for phone calls and voicemails.
@dlt.source(max_table_nesting=2)
def zendesk_talk(
credentials: TZendeskCredentials = dlt.secrets.value,
- start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE,
+ start_date: Optional[TAnyDateTime] = START_DATE,
end_date: Optional[TAnyDateTime] = None,
) -> Iterable[DltResource]:
...
@@ -335,9 +335,9 @@ verified source.
dev_mode=False,
dataset_name="sample_zendesk_data" # Use a custom name if desired
)
- data = zendesk_support(load_all=True, start_date=start_date)
- data_chat = zendesk_chat(start_date=start_date)
- data_talk = zendesk_talk(start_date=start_date)
+ data = zendesk_support(load_all=True, start_date=START_DATE)
+ data_chat = zendesk_chat(start_date=START_DATE)
+ data_talk = zendesk_talk(start_date=START_DATE)
info = pipeline.run(data=[data, data_chat, data_talk])
print(info)
```
@@ -354,7 +354,7 @@ verified source.
min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC")
max_end_date = pendulum.today()
# Generate tuples of date ranges, each with 1 week in between.
- ranges = make_date_ranges(min_start_date, max_end_date, timedelta(weeks=1))
+ ranges = make_date_ranges(min_start_date, max_end_date, datetime.timedelta(weeks=1))
# Run the pipeline in a loop for each 1-week range
for start, end in ranges:
diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md
index 2d7a7642c2..79ac7b89ad 100644
--- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md
+++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md
@@ -64,9 +64,9 @@ pipeline = dlt.pipeline(
with pipeline.sql_client() as client:
with client.execute_query(
'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues'
- ) as table:
+ ) as cursor:
# calling `df` on a cursor, returns the data as a pandas DataFrame
- reactions = table.df()
+ reactions = cursor.df()
counts = reactions.sum(0).sort_values(0, ascending=False)
```
diff --git a/docs/website/docs/general-usage/credentials/advanced.md b/docs/website/docs/general-usage/credentials/advanced.md
index 2d7745eb44..2a350c6a56 100644
--- a/docs/website/docs/general-usage/credentials/advanced.md
+++ b/docs/website/docs/general-usage/credentials/advanced.md
@@ -42,7 +42,7 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen
def slack_source(
page_size: int = MAX_PAGE_SIZE,
access_token: str = dlt.secrets.value,
- start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE
+ start_date: Optional[TAnyDateTime] = START_DATE
):
...
```
@@ -133,13 +133,13 @@ Additionally, you can store custom settings within the same configuration files.
```py
# use `dlt.secrets` and `dlt.config` to explicitly take
# those values from providers from the explicit keys
-data_source = google_sheets(
+source_instance = google_sheets(
dlt.config["sheet_id"],
dlt.config["my_section.tabs"],
dlt.secrets["my_section.gcp_credentials"]
)
-data_source.run(destination="bigquery")
+source_instance.run(destination="bigquery")
```
`dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](setup) - environment variables, TOML files, etc. to create these dictionaries. You can also use `dlt.config.get()` or `dlt.secrets.get()` to
@@ -182,7 +182,7 @@ def google_sheets(
sheets = build('sheets', 'v4', credentials=ServiceAccountCredentials.from_service_account_info(credentials))
tabs = []
for tab_name in tab_names:
- data = get_sheet(sheets, spreadsheet_id, tab_name)
+ data = _get_sheet(sheets, spreadsheet_id, tab_name)
tabs.append(dlt.resource(data, name=tab_name))
return tabs
```
diff --git a/docs/website/docs/general-usage/credentials/complex_types.md b/docs/website/docs/general-usage/credentials/complex_types.md
index d14e031097..6414ff0a6a 100644
--- a/docs/website/docs/general-usage/credentials/complex_types.md
+++ b/docs/website/docs/general-usage/credentials/complex_types.md
@@ -58,9 +58,9 @@ dsn.password="loader"
You can explicitly provide credentials in various forms:
```py
-query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data")
+query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") # type: ignore[arg-type]
# or
-query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"})
+query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"}) # type: ignore[arg-type]
```
## Built-in credentials
@@ -109,7 +109,7 @@ The `OAuth2Credentials` class handles OAuth 2.0 credentials, including client ID
Usage:
```py
-credentials = OAuth2Credentials(
+oauth_credentials = OAuth2Credentials(
client_id="CLIENT_ID",
client_secret="CLIENT_SECRET", # type: ignore
refresh_token="REFRESH_TOKEN", # type: ignore
@@ -117,10 +117,10 @@ credentials = OAuth2Credentials(
)
# Authorize the client
-credentials.auth()
+oauth_credentials.auth()
# Add additional scopes
-credentials.add_scopes(["scope3", "scope4"])
+oauth_credentials.add_scopes(["scope3", "scope4"])
```
`OAuth2Credentials` is a base class to implement actual OAuth; for example, it is a base class for [GcpOAuthCredentials](#gcpoauthcredentials).
@@ -146,18 +146,19 @@ The `GcpServiceAccountCredentials` class manages GCP Service Account credentials
- Or default credentials will be used.
```py
-credentials = GcpServiceAccountCredentials()
+gcp_credentials = GcpServiceAccountCredentials()
# Parse a native value (ServiceAccountCredentials)
# Accepts a native value, which can be either an instance of ServiceAccountCredentials
# or a serialized services.json.
# Parses the native value and updates the credentials.
-native_value = {"private_key": ".."} # or "path/to/services.json"
-credentials.parse_native_representation(native_value)
+gcp_native_value = {"private_key": ".."} # or "path/to/services.json"
+gcp_credentials.parse_native_representation(gcp_native_value)
```
or more preferred use:
```py
import dlt
from dlt.sources.credentials import GcpServiceAccountCredentials
+from google.analytics import BetaAnalyticsDataClient
@dlt.source
def google_analytics(
@@ -255,21 +256,21 @@ The `AwsCredentials` class is responsible for handling AWS credentials, includin
#### Usage
```py
-credentials = AwsCredentials()
+aws_credentials = AwsCredentials()
# Set the necessary attributes
-credentials.aws_access_key_id = "ACCESS_KEY_ID"
-credentials.aws_secret_access_key = "SECRET_ACCESS_KEY"
-credentials.region_name = "us-east-1"
+aws_credentials.aws_access_key_id = "ACCESS_KEY_ID"
+aws_credentials.aws_secret_access_key = "SECRET_ACCESS_KEY"
+aws_credentials.region_name = "us-east-1"
```
or
```py
# Imports an external botocore session and sets the credentials properties accordingly.
import botocore.session
-credentials = AwsCredentials()
+aws_credentials = AwsCredentials()
session = botocore.session.get_session()
-credentials.parse_native_representation(session)
-print(credentials.aws_access_key_id)
+aws_credentials.parse_native_representation(session)
+print(aws_credentials.aws_access_key_id)
```
or more preferred use:
```py
@@ -314,10 +315,10 @@ The `AzureCredentials` class is responsible for handling Azure Blob Storage cred
#### Usage
```py
-credentials = AzureCredentials()
+az_credentials = AzureCredentials()
# Set the necessary attributes
-credentials.azure_storage_account_name = "ACCOUNT_NAME"
-credentials.azure_storage_account_key = "ACCOUNT_KEY"
+az_credentials.azure_storage_account_name = "ACCOUNT_NAME"
+az_credentials.azure_storage_account_key = "ACCOUNT_KEY"
```
or more preferred use:
```py
@@ -364,7 +365,7 @@ Example:
@dlt.source
def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False):
# Depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected into the `credentials` argument. Both classes implement `auth` so you can always call it.
- credentials.auth()
+ credentials.auth() # type: ignore[union-attr]
return dlt.resource([credentials], name="credentials")
# Pass native value
diff --git a/docs/website/docs/general-usage/credentials/setup.md b/docs/website/docs/general-usage/credentials/setup.md
index 709cf09fe8..b8f60e8156 100644
--- a/docs/website/docs/general-usage/credentials/setup.md
+++ b/docs/website/docs/general-usage/credentials/setup.md
@@ -270,7 +270,7 @@ dlt.config.register_provider(provider)
```
:::tip
-Check out an [example](../../examples/custom_config_provider) for a `yaml` based config provider that supports switchable profiles.
+Check out an [example](../../examples/custom_config_provider) for a YAML based config provider that supports switchable profiles.
:::
## Examples
diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md
index a5bee3251c..d07056ff40 100644
--- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md
+++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md
@@ -46,14 +46,14 @@ for row in dummy_source().dummy_data.add_map(pseudonymize_name):
# Or create an instance of the data source, modify the resource and run the source.
# 1. Create an instance of the source so you can edit it.
-data_source = dummy_source()
+source_instance = dummy_source()
# 2. Modify this source instance's resource
-data_resource = data_source.dummy_data.add_map(pseudonymize_name)
+data_resource = source_instance.dummy_data.add_map(pseudonymize_name)
# 3. Inspect your result
-for row in data_resource:
+for row in source_instance:
print(row)
pipeline = dlt.pipeline(pipeline_name='example', destination='bigquery', dataset_name='normalized_data')
-load_info = pipeline.run(data_resource)
+load_info = pipeline.run(source_instance)
```
diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md
index 3d2d389c15..946f6031f8 100644
--- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md
+++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md
@@ -56,17 +56,17 @@ Let's create a sample pipeline demonstrating the process of removing a column.
remove_columns_list = ["country_code"]
# Create an instance of the source so you can edit it.
- data_source = dummy_source()
+ source_instance = dummy_source()
# Modify this source instance's resource
- data_source = data_source.dummy_data.add_map(
+ source_instance.dummy_data.add_map(
lambda doc: remove_columns(doc, remove_columns_list)
)
```
4. You can optionally inspect the result:
```py
- for row in data_source:
+ for row in source_instance:
print(row)
#{'id': 0, 'name': 'Jane Washington 0'}
#{'id': 1, 'name': 'Jane Washington 1'}
diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md
index eca42f50df..edcacffacc 100644
--- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md
+++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md
@@ -39,13 +39,13 @@ def replace_umlauts_in_dict_keys(d):
# We can add the map function to the resource
# 1. Create an instance of the source so you can edit it.
-data_source = dummy_source()
+source_instance = dummy_source()
# 2. Modify this source instance's resource
-data_resource = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys)
+source_instance.dummy_data().add_map(replace_umlauts_in_dict_keys)
# 3. Inspect your result
-for row in data_resource:
+for row in source_instance:
print(row)
# {'Objekt_0': {'Groesse': 0, 'Aequivalenzpruefung': True}}
diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md
index 4740eafa4b..881868cd0e 100644
--- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md
+++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md
@@ -113,7 +113,7 @@ API token.
information securely, like access tokens. Keep this file safe. Here's its format for service
account authentication:
- ```py
+ ```toml
[sources]
api_key= "Please set me up!" # ExchangeRate-API key
```
@@ -147,26 +147,26 @@ API token.
target_currency = "EUR"
# Retrieve the API key from DLT secrets
- api_key = dlt.secrets.get("sources.api_key")
+ api_key: str = dlt.secrets.get("sources.api_key")
# Initialize or retrieve the state for currency rates
rates_state = dlt.current.resource_state().setdefault("rates", {})
currency_pair_key = f"{base_currency}-{target_currency}"
currency_pair_state = rates_state.setdefault(currency_pair_key, {
- "last_update": datetime.min,
+ "last_update": datetime.datetime.min,
"rate": None
})
# Update the exchange rate if it's older than 12 hours
if (currency_pair_state.get("rate") is None or
- (datetime.utcnow() - currency_pair_state["last_update"] >= timedelta(hours=12))):
+ (datetime.datetime.utcnow() - currency_pair_state["last_update"] >= datetime.timedelta(hours=12))):
url = f"https://v6.exchangerate-api.com/v6/{api_key}/pair/{base_currency}/{target_currency}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
currency_pair_state.update({
"rate": data.get("conversion_rate"),
- "last_update": datetime.fromtimestamp(data.get("time_last_update_unix"))
+ "last_update": datetime.datetime.fromtimestamp(data.get("time_last_update_unix"))
})
print(f"The latest rate of {data.get('conversion_rate')} for the currency pair {currency_pair_key} is fetched and updated.")
else:
diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
index 958ca3cbe0..2ef9533d5f 100644
--- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
+++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
@@ -102,7 +102,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive information securely, like access tokens. Keep this file safe. Here's its format for service account authentication:
- ```py
+ ```toml
[sources]
api_key= "Please set me up!" # Serp Api key.
```
@@ -111,7 +111,6 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
1. Create the `fetch_average_price()` function as follows:
```py
- from datetime import datetime, timedelta
import requests
# Uncomment transformer function if it is to be used as a transformer,
@@ -138,13 +137,13 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
"""
# Retrieve the API key from dlt secrets
- api_key = dlt.secrets.get("sources.api_key")
+ api_key: str = dlt.secrets.get("sources.api_key")
# Get the current resource state for device information
device_info = dlt.current.resource_state().setdefault("devices", {})
# Current timestamp for checking the last update
- current_timestamp = datetime.now()
+ current_timestamp = datetime.datetime.now()
# Print the current device information
# print(device_info) # if you need to check state
@@ -156,10 +155,10 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
# Calculate the time since the last update
last_updated = (
current_timestamp -
- device_data.get('timestamp', datetime.min)
+ device_data.get('timestamp', datetime.datetime.min)
)
# Check if the device is not in state or data is older than 180 days
- if device not in device_info or last_updated > timedelta(days=180):
+ if device not in device_info or last_updated > datetime.timedelta(days=180):
try:
# Make an API request to fetch device prices
response = requests.get("https://serpapi.com/search", params={
diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md
index b252fbef92..d1a3cf6df3 100644
--- a/docs/website/docs/general-usage/full-loading.md
+++ b/docs/website/docs/general-usage/full-loading.md
@@ -17,7 +17,7 @@ issues = []
reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"]
for reaction in reactions:
for page_no in range(1, 3):
- page = requests.get(f"https://api.github.com/repos/{repo}/issues?state=all&sort=reactions-{reaction}&per_page=100&page={page_no}", headers=headers)
+ page = requests.get(f"https://api.github.com/repos/{REPO_NAME}/issues?state=all&sort=reactions-{reaction}&per_page=100&page={page_no}", headers=headers)
print(f"Got page for {reaction} page {page_no}, requests left", page.headers["x-ratelimit-remaining"])
issues.extend(page.json())
p.run(issues, write_disposition="replace", primary_key="id", table_name="issues")
diff --git a/docs/website/docs/general-usage/http/requests.md b/docs/website/docs/general-usage/http/requests.md
index cf7711cdd7..b0b7a03d16 100644
--- a/docs/website/docs/general-usage/http/requests.md
+++ b/docs/website/docs/general-usage/http/requests.md
@@ -27,7 +27,7 @@ And use it just like you would use `requests`:
```py
response = requests.get(
'https://example.com/api/contacts',
- headers={'Authorization': MY_API_KEY}
+ headers={'Authorization': API_KEY}
)
data = response.json()
...
diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md
index b8ad3830d2..53e1a2b7c5 100644
--- a/docs/website/docs/general-usage/http/rest-client.md
+++ b/docs/website/docs/general-usage/http/rest-client.md
@@ -546,7 +546,7 @@ class OAuth2ClientCredentialsHTTPBasic(OAuth2ClientCredentials):
"data": self.access_token_request_data,
}
-auth = OAuth2ClientCredentialsHTTPBasic(
+oauth = OAuth2ClientCredentialsHTTPBasic(
access_token_url=dlt.secrets["sources.zoom.access_token_url"], # "https://zoom.us/oauth/token"
client_id=dlt.secrets["sources.zoom.client_id"],
client_secret=dlt.secrets["sources.zoom.client_secret"],
@@ -555,7 +555,7 @@ auth = OAuth2ClientCredentialsHTTPBasic(
"account_id": dlt.secrets["sources.zoom.account_id"],
},
)
-client = RESTClient(base_url="https://api.zoom.us/v2", auth=auth)
+client = RESTClient(base_url="https://api.zoom.us/v2", auth=oauth)
response = client.get("/users")
```
@@ -593,7 +593,7 @@ client = RESTClient(
`RESTClient.paginate()` allows you to specify a [custom hook function](https://requests.readthedocs.io/en/latest/user/advanced/#event-hooks) that can be used to modify the response objects. For example, to handle specific HTTP status codes gracefully:
```py
-def custom_response_handler(response):
+def custom_response_handler(response, *args):
if response.status_code == 404:
# Handle not found
pass
@@ -680,7 +680,7 @@ for page in client.paginate("/posts"):
```py
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
-def response_hook(response, **kwargs):
+def response_hook(response, *args):
print(response.status_code)
print(f"Content: {response.content}")
print(f"Request: {response.request.body}")
diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md
index c8f92cf154..3f452f0d16 100644
--- a/docs/website/docs/general-usage/incremental-loading.md
+++ b/docs/website/docs/general-usage/incremental-loading.md
@@ -100,7 +100,7 @@ issues = []
reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"]
for reaction in reactions:
for page_no in range(1, 3):
- page = requests.get(f"https://api.github.com/repos/{repo}/issues?state=all&sort=reactions-{reaction}&per_page=100&page={page_no}", headers=headers)
+ page = requests.get(f"https://api.github.com/repos/{REPO_NAME}/issues?state=all&sort=reactions-{reaction}&per_page=100&page={page_no}", headers=headers)
print(f"got page for {reaction} page {page_no}, requests left", page.headers["x-ratelimit-remaining"])
issues.extend(page.json())
p.run(issues, write_disposition="merge", primary_key="id", table_name="issues")
@@ -655,30 +655,6 @@ def get_events(last_created_at = dlt.sources.incremental("$", last_value_func=by
yield json.load(f)
```
-### Using `last_value_func` for lookback
-The example below uses the `last_value_func` to load data from the past month.
-```py
-def lookback(event):
- last_value = None
- if len(event) == 1:
- item, = event
- else:
- item, last_value = event
-
- if last_value is None:
- last_value = {}
- else:
- last_value = dict(last_value)
-
- last_value["created_at"] = pendulum.from_timestamp(item["created_at"]).subtract(months=1)
- return last_value
-
-@dlt.resource(primary_key="id")
-def get_events(last_created_at = dlt.sources.incremental("created_at", last_value_func=lookback)):
- with open("tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8") as f:
- yield json.load(f)
-```
-
### Using `end_value` for backfill
You can specify both initial and end dates when defining incremental loading. Let's go back to our Github example:
@@ -807,7 +783,7 @@ def tickets(
`Incremental` **does not** deduplicate datasets like the **merge** write disposition does. However, it ensures that when another portion of data is extracted, records that were previously loaded won't be included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive (i.e., greater than or equal). This ensures that you never lose any data but will also re-acquire some rows. For example, if you have a database table with a cursor field on `updated_at` which has a day resolution, then there's a high chance that after you extract data on a given day, more records will still be added. When you extract on the next day, you should reacquire data from the last day to ensure all records are present; however, this will create overlap with data from the previous extract.
-By default, a content hash (a hash of the `json` representation of a row) will be used to deduplicate. This may be slow, so `dlt.sources.incremental` will inherit the primary key that is set on the resource. You can optionally set a `primary_key` that is used exclusively to deduplicate and which does not become a table hint. The same setting lets you disable the deduplication altogether when an empty tuple is passed. Below, we pass `primary_key` directly to `incremental` to disable deduplication. That overrides the `delta` primary_key set in the resource:
+By default, a content hash (a hash of the JSON representation of a row) will be used to deduplicate. This may be slow, so `dlt.sources.incremental` will inherit the primary key that is set on the resource. You can optionally set a `primary_key` that is used exclusively to deduplicate and which does not become a table hint. The same setting lets you disable the deduplication altogether when an empty tuple is passed. Below, we pass `primary_key` directly to `incremental` to disable deduplication. That overrides the `delta` primary_key set in the resource:
```py
@dlt.resource(primary_key="delta")
@@ -826,11 +802,10 @@ When resources are [created dynamically](source.md#create-resources-dynamically)
def stripe():
# declare a generator function
def get_resource(
- endpoint: Endpoints,
+ endpoints: List[str] = ENDPOINTS,
created: dlt.sources.incremental=dlt.sources.incremental("created")
):
...
- yield data
# create resources for several endpoints on a single decorator function
for endpoint in endpoints:
@@ -878,10 +853,12 @@ We opt-in to the Airflow scheduler by setting `allow_external_schedulers` to `Tr
Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the DAG:
```py
+from dlt.helpers.airflow_helper import PipelineTasksGroup
+
@dag(
schedule_interval='@weekly',
- start_date=pendulum.datetime(2023, 2, 1),
- end_date=pendulum.datetime(2023, 8, 1),
+ start_date=pendulum.DateTime(2023, 2, 1),
+ end_date=pendulum.DateTime(2023, 8, 1),
catchup=True,
max_active_runs=1,
default_args=default_task_args
@@ -918,7 +895,7 @@ You can repurpose the DAG above to start loading new data incrementally after (o
```py
@dag(
schedule_interval='@daily',
- start_date=pendulum.datetime(2023, 2, 1),
+ start_date=pendulum.DateTime(2023, 2, 1),
catchup=False,
max_active_runs=1,
default_args=default_task_args
@@ -1087,6 +1064,65 @@ result_filtered = list(without_none)
assert len(result_filtered) == 2
```
+## Lag / attribution window
+In many cases, certain data should be reacquired during incremental loading. For example, you may want to always capture the last 7 days of data when fetching daily analytics reports, or refresh Slack message replies with a moving window of 7 days. This is where the concept of "lag" or "attribution window" comes into play.
+
+The `lag` parameter is a float and works with several types of incremental cursors: `datetime`, `date`, `integer`, and `float`. It can only be used when `last_value_func` is `min` or `max` (the default is `max`).
+
+### How `lag` works
+
+- **Datetime cursors**: `lag` is the number of seconds added or subtracted from the `last_value` loaded.
+- **Date cursors**: `lag` represents days.
+- **Numeric cursors (integer or float)**: `lag` is interpreted in the unit of the cursor itself.
+
+This flexibility allows `lag` to adapt to different data contexts.
+
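+As a quick sketch of a numeric cursor (the `updated_seq` column and the sample rows below are illustrative and not part of the example that follows), `lag=100` makes every run re-read the last 100 sequence numbers:
+
+```py
+import dlt
+
+@dlt.resource(primary_key="id", write_disposition="merge")
+def records(
+    # integer cursor: with lag=100, each run re-acquires the last 100 sequence numbers
+    updated_seq=dlt.sources.incremental("updated_seq", initial_value=0, lag=100)
+):
+    # stand-in data; a real resource would query an API or database here
+    yield [
+        {"id": 1, "updated_seq": 1001, "value": "a"},
+        {"id": 2, "updated_seq": 1002, "value": "b"},
+    ]
+```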
+
+### Example using `datetime` incremental cursor with `merge` as `write_disposition`
+
+This example demonstrates how to use a `datetime` cursor with a `lag` parameter, applying `merge` as the `write_disposition`. The setup runs twice, and during the second run, the `lag` parameter re-fetches recent entries to capture updates.
+
+1. **First Run**: Loads `initial_entries`.
+2. **Second Run**: Loads `second_run_events` with the specified lag, refreshing previously loaded entries.
+
+This setup demonstrates how `lag` ensures that a defined period of data remains refreshed, capturing updates or changes within the attribution window.
+
+```py
+import dlt
+import duckdb
+
+pipeline = dlt.pipeline(
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+)
+
+# Flag to indicate the second run
+is_second_run = False
+
+@dlt.resource(name="events", primary_key="id", write_disposition="merge")
+def events_resource(
+ _=dlt.sources.incremental("created_at", lag=3600, last_value_func=max)
+):
+ global is_second_run
+
+ # Data for the initial run
+ initial_entries = [
+ {"id": 1, "created_at": "2023-03-03T01:00:00Z", "event": "1"},
+ {"id": 2, "created_at": "2023-03-03T02:00:00Z", "event": "2"}, # lag applied during second run
+ ]
+
+ # Data for the second run
+ second_run_events = [
+ {"id": 1, "created_at": "2023-03-03T01:00:00Z", "event": "1_updated"},
+ {"id": 2, "created_at": "2023-03-03T02:00:01Z", "event": "2_updated"},
+ {"id": 3, "created_at": "2023-03-03T03:00:00Z", "event": "3"},
+ ]
+
+ # Yield data based on the current run
+ yield from second_run_events if is_second_run else initial_entries
+
+# Run the pipeline twice
+pipeline.run(events_resource)
+is_second_run = True # Update flag for second run
+pipeline.run(events_resource)
+```
+
## Doing a full refresh
@@ -1132,7 +1168,7 @@ def tweets():
# Get the last value from loaded metadata. If it does not exist, get None
last_val = dlt.current.resource_state().setdefault("last_updated", None)
# Get data and yield it
- data = get_data(start_from=last_val)
+ data = _get_data(start_from=last_val)
yield data
# Change the state to the new value
dlt.current.resource_state()["last_updated"] = data["last_timestamp"]
@@ -1183,7 +1219,7 @@ def players_games(chess_url, players, start_month=None, end_month=None):
# when the data is loaded, the cache is updated with our loaded_archives_cache
# Get archives for a given player
- archives = get_players_archives(chess_url, players)
+ archives = _get_players_archives(chess_url, players)
for url in archives:
# If not in cache, yield the data and cache the URL
if url not in loaded_archives_cache:
@@ -1208,7 +1244,7 @@ def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, sta
print(f'last_value_cache: {last_value_cache}')
params = {...}
url = "https://api.twitter.com/2/tweets/search/recent"
- response = _paginated_get(url, headers=headers, params=params)
+ response = _get_paginated(url, headers=headers, params=params)
for page in response:
page['search_term'] = search_term
last_id = page.get('meta', {}).get('newest_id', 0)
diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md
index 0a159af910..1fd4db3a7e 100644
--- a/docs/website/docs/general-usage/pipeline.md
+++ b/docs/website/docs/general-usage/pipeline.md
@@ -34,8 +34,10 @@ You instantiate a pipeline by calling the `dlt.pipeline` function with the follo
will load the data. It may also be provided to the `run` method of the `pipeline`.
- `dataset_name`: a name of the dataset to which the data will be loaded. A dataset is a logical
group of tables, i.e., `schema` in relational databases or a folder grouping many files. It may also be
- provided later to the `run` or `load` methods of the pipeline. If not provided at all, then
- it defaults to the `pipeline_name`.
+ provided later to the `run` or `load` methods of the pipeline. If not provided, it
+ defaults to `{pipeline_name}_dataset` on destinations that require datasets (most warehouses),
+ as shown in the example below. It stays empty on destinations that do not separate tables
+ into datasets (or database schemas), i.e., on vector databases or ClickHouse.
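+
+For example, a pipeline created without an explicit `dataset_name` (a minimal sketch; the pipeline name is illustrative):
+
+```py
+import dlt
+
+# with no dataset_name given, data on duckdb lands in the "github_issues_dataset" dataset
+pipeline = dlt.pipeline(pipeline_name="github_issues", destination="duckdb")
+```
+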
To load the data, you call the `run` method and pass your data in the `data` argument.
@@ -213,6 +215,8 @@ pipeline = dlt.pipeline(
You can fully configure the progress monitor. See two examples below:
```py
+from airflow.operators.python import get_current_context # noqa
+
# log each minute to Airflow task logger
ti = get_current_context()["ti"]
pipeline = dlt.pipeline(
diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md
index 579452cc0c..199eaf9b5d 100644
--- a/docs/website/docs/general-usage/resource.md
+++ b/docs/website/docs/general-usage/resource.md
@@ -197,17 +197,17 @@ def users_details(user_item):
for detail in _get_details(user_item["user_id"]):
yield detail
-# Just load the user_details.
+# Just load the users_details.
# dlt figures out dependencies for you.
-pipeline.run(user_details)
+pipeline.run(users_details)
```
-In the example above, `user_details` will receive data from the default instance of the `users` resource (with `limit` set to `None`). You can also use the **pipe |** operator to bind resources dynamically.
+In the example above, `users_details` will receive data from the default instance of the `users` resource (with `limit` set to `None`). You can also use the **pipe |** operator to bind resources dynamically.
```py
# You can be more explicit and use a pipe operator.
# With it, you can create dynamic pipelines where the dependencies
# are set at run time and resources are parametrized, i.e.,
# below we want to load only 100 users from the `users` endpoint.
-pipeline.run(users(limit=100) | user_details)
+pipeline.run(users(limit=100) | users_details)
```
:::tip
@@ -232,12 +232,12 @@ print(list([1,2] | pokemon()))
A standalone resource is defined on a function that is top-level in a module (not an inner function) that accepts config and secrets values. Additionally, if the `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function, and the user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use.
```py
@dlt.resource(standalone=True)
-def filesystem(bucket_url=dlt.config.value):
+def fs_resource(bucket_url=dlt.config.value):
"""List and yield files in `bucket_url`."""
...
# `filesystem` must be called before it is extracted or used in any other way.
-pipeline.run(filesystem("s3://my-bucket/reports"), table_name="reports")
+pipeline.run(fs_resource("s3://my-bucket/reports"), table_name="reports")
```
Standalone may have a dynamic name that depends on the arguments passed to the decorated function. For example:
@@ -306,7 +306,7 @@ import dlt
@dlt.resource(write_disposition="replace")
def users():
...
- users = requests.get(...)
+ users = requests.get(RESOURCE_URL)
...
yield users
```
@@ -317,8 +317,8 @@ Here's our script that defines transformations and loads the data:
from pipedrive import users
def anonymize_user(user_data):
- user_data["user_id"] = hash_str(user_data["user_id"])
- user_data["user_email"] = hash_str(user_data["user_email"])
+ user_data["user_id"] = _hash_str(user_data["user_id"])
+ user_data["user_email"] = _hash_str(user_data["user_email"])
return user_data
# add the filter and anonymize function to users resource and enumerate
@@ -372,8 +372,6 @@ tables of a nested table). Typical settings:
You can achieve the same effect after the resource instance is created:
```py
-from my_resource import my_awesome_module
-
resource = my_resource()
resource.max_table_nesting = 0
```
@@ -454,11 +452,11 @@ def sql_table(credentials, schema, table):
for idx, batch in enumerate(table_rows(engine, table_obj)):
if idx == 0:
- # Emit the first row with hints, table_to_columns and get_primary_key are helpers that extract dlt schema from
+ # Emit the first row with hints, table_to_columns and _get_primary_key are helpers that extract dlt schema from
# SqlAlchemy model
yield dlt.mark.with_hints(
batch,
- dlt.mark.make_hints(columns=table_to_columns(table_obj), primary_key=get_primary_key(table_obj)),
+ dlt.mark.make_hints(columns=table_to_columns(table_obj), primary_key=_get_primary_key(table_obj)),
)
else:
# Just yield all the other rows
@@ -477,8 +475,7 @@ You can import external files, i.e., CSV, Parquet, and JSONL, by yielding items
```py
import os
import dlt
-
-from filesystem import filesystem
+from dlt.sources.filesystem import filesystem
columns: List[TColumnSchema] = [
{"name": "id", "data_type": "bigint"},
@@ -501,7 +498,8 @@ def orders(items: Iterator[FileItemDict]):
yield dlt.mark.with_file_import(dest_file, "csv")
-# use the filesystem verified source to glob a bucket
+# use the filesystem core source to glob a bucket
+
downloader = filesystem(
bucket_url="s3://my_bucket/csv",
file_glob="today/*.csv.gz") | orders
@@ -527,9 +525,10 @@ You can sniff the schema from the data, i.e., using DuckDB to infer the table sc
### Duplicate and rename resources
There are cases when your resources are generic (i.e., bucket filesystem) and you want to load several instances of it (i.e., files from different folders) into separate tables. In the example below, we use the `filesystem` source to load csvs from two different folders into separate tables:
+
```py
@dlt.resource(standalone=True)
-def filesystem(bucket_url):
+def fs_resource(bucket_url):
# list and yield files in bucket_url
...
@@ -540,8 +539,8 @@ def csv_reader(file_item):
# create two extract pipes that list files from the bucket and send them to the reader.
# by default, both pipes will load data to the same table (csv_reader)
-reports_pipe = filesystem("s3://my-bucket/reports") | load_csv()
-transactions_pipe = filesystem("s3://my-bucket/transactions") | load_csv()
+reports_pipe = fs_resource("s3://my-bucket/reports") | csv_reader()
+transactions_pipe = fs_resource("s3://my-bucket/transactions") | csv_reader()
# so we rename resources to load to "reports" and "transactions" tables
pipeline.run(
@@ -582,7 +581,7 @@ def generate_rows(nr):
for i in range(nr):
yield {'id': i, 'example_string': 'abc'}
```
-The resource above will be saved and loaded from a `parquet` file (if the destination supports it).
+The resource above will be saved and loaded from a Parquet file (if the destination supports it).
:::note
A special `file_format`: **preferred** will load the resource using a format that is preferred by a destination. This setting supersedes the `loader_file_format` passed to the `run` method.
diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md
index 6c557f2c45..d7e1627f7f 100644
--- a/docs/website/docs/general-usage/schema-contracts.md
+++ b/docs/website/docs/general-usage/schema-contracts.md
@@ -105,7 +105,7 @@ As a consequence, `discard_row` will drop the whole data item - even if a nested
### Set contracts on Arrow tables and Pandas
-All contract settings apply to [arrow tables and panda frames](../dlt-ecosystem/verified-sources/arrow-pandas.md) as well.
+All contract settings apply to [Arrow tables and pandas frames](../dlt-ecosystem/verified-sources/arrow-pandas.md) as well.
1. **tables** mode is the same - no matter what the data item type is.
2. **columns** will allow new columns, raise an exception, or modify tables/frames still in the extract step to avoid rewriting Parquet files.
3. **data_type** changes to data types in tables/frames are not allowed and will result in a data type schema clash. We could allow for more modes (evolving data types in Arrow tables sounds weird but ping us on Slack if you need it.)
@@ -181,7 +181,7 @@ def items():
The below code will raise an error on any encountered schema change. Note: You can always set a string which will be interpreted as though all keys are set to these values.
```py
-pipeline.run(my_source(), schema_contract="freeze")
+pipeline.run(my_source, schema_contract="freeze")
```
The below code defines some settings on the source which can be overwritten on the resource, which in turn can be overwritten by the global override on the `run` method.
@@ -198,15 +198,15 @@ def other_items():
...
@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"})
-def source():
+def frozen_source():
return [items(), other_items()]
# this will use the settings defined by the decorators
-pipeline.run(source())
+pipeline.run(frozen_source())
# this will freeze the whole schema, regardless of the decorator settings
-pipeline.run(source(), schema_contract="freeze")
+pipeline.run(frozen_source(), schema_contract="freeze")
```
diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md
index 7b50ea139d..6ef638886d 100644
--- a/docs/website/docs/general-usage/schema-evolution.md
+++ b/docs/website/docs/general-usage/schema-evolution.md
@@ -119,7 +119,7 @@ from dlt.common.runtime.slack import send_slack_message
hook = "https://hooks.slack.com/services/xxx/xxx/xxx"
# Iterate over each package in the load_info object
-for package in info.load_packages:
+for package in load_info.load_packages:
# Iterate over each table in the schema_update of the current package
for table_name, table in package.schema_update.items():
# Iterate over each column in the current table
diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md
index 2903221a36..13084d2c1a 100644
--- a/docs/website/docs/general-usage/schema.md
+++ b/docs/website/docs/general-usage/schema.md
@@ -295,9 +295,11 @@ settings:
```
Above, we add a `partition` hint to all columns ending with `_timestamp`. You can do the same thing in the code:
```py
+ from dlt.common.schema.typing import TSimpleRegex
+
source = data_source()
# this will update existing hints with the hints passed
- source.schema.merge_hints({"partition": ["re:_timestamp$"]})
+ source.schema.merge_hints({"partition": [TSimpleRegex("re:_timestamp$")]})
```
### Preferred data types
@@ -321,10 +323,10 @@ Here's the same thing in code:
source = data_source()
source.schema.update_preferred_types(
{
- "re:timestamp": "timestamp",
- "inserted_at": "timestamp",
- "created_at": "timestamp",
- "updated_at": "timestamp",
+ TSimpleRegex("re:timestamp"): "timestamp",
+ TSimpleRegex("inserted_at"): "timestamp",
+ TSimpleRegex("created_at"): "timestamp",
+ TSimpleRegex("updated_at"): "timestamp",
}
)
```
@@ -335,6 +337,7 @@ Here's the same thing in code:
Directly define data types and their properties, such as nullability, within the `@dlt.resource` decorator. This eliminates the dependency on external schema files. For example:
```py
+
@dlt.resource(name='my_table', columns={"my_column": {"data_type": "bool", "nullable": True}})
def my_resource():
for i in range(10):
@@ -388,7 +391,7 @@ This will display a structured YAML representation of your schema, showing detai
## Export and import schema files
-Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import `yaml`
+Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import YAML
schema files in your pipeline.
## Attaching schemas to sources
diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md
index f91eca58de..a5f1f04dee 100644
--- a/docs/website/docs/general-usage/source.md
+++ b/docs/website/docs/general-usage/source.md
@@ -187,8 +187,7 @@ def my_resource():
```
or
```py
-my_source = source()
-my_source.my_resource.max_table_nesting = 0
+source.my_resource.max_table_nesting = 0
```
### Modify schema
diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md
index db742c20b5..46aa1d63ce 100644
--- a/docs/website/docs/general-usage/state.md
+++ b/docs/website/docs/general-usage/state.md
@@ -21,7 +21,7 @@ def players_games(chess_url, player, start_month=None, end_month=None):
# create or request a list of archives from resource-scoped state
checked_archives = dlt.current.resource_state().setdefault("archives", [])
# get a list of archives for a particular player
- archives = player_archives(chess_url, player)
+ archives = _get_players_archives(chess_url, player)
for url in archives:
if url in checked_archives:
print(f"skipping archive {url}")
diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md
index 650c47920b..76e3a34736 100644
--- a/docs/website/docs/intro.md
+++ b/docs/website/docs/intro.md
@@ -104,7 +104,7 @@ The [Filesystem](./tutorial/filesystem) source extracts data from AWS S3, Google
```py
from dlt.sources.filesystem import filesystem
-source = filesystem(
+resource = filesystem(
bucket_url="s3://example-bucket",
file_glob="*.csv"
)
@@ -115,7 +115,7 @@ pipeline = dlt.pipeline(
dataset_name="filesystem_data",
)
-load_info = pipeline.run(source)
+load_info = pipeline.run(resource)
```
Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more about the source configuration and supported storage services.
diff --git a/docs/website/docs/reference/frequently-asked-questions.md b/docs/website/docs/reference/frequently-asked-questions.md
index 6cae330845..7106c8c9b6 100644
--- a/docs/website/docs/reference/frequently-asked-questions.md
+++ b/docs/website/docs/reference/frequently-asked-questions.md
@@ -13,8 +13,7 @@ If certain columns should not be normalized, you can mark them as `json`. This c
1. When fetching the source data.
```py
- source_data = my_source()
- source_data.resource3.apply_hints(
+ my_source.resource3.apply_hints(
columns={
"column_name": {
"data_type": "json"
@@ -61,7 +60,7 @@ p = dlt.pipeline(
)
# Extract data using the predefined source `my_source`
-p.extract(my_source().add_limit(10))
+p.extract(my_source.add_limit(10))
# Normalize the data structure for consistency
p.normalize()
diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md
index 6c542fec8c..ab171ac069 100644
--- a/docs/website/docs/reference/performance.md
+++ b/docs/website/docs/reference/performance.md
@@ -36,7 +36,7 @@ processing.
### Controlling intermediary file size and rotation
`dlt` writes data to intermediary files. You can control the file size and the number of created files by setting the maximum number of data items stored in a single file or the maximum single file size. Keep in mind that the file size is computed after compression has been performed.
-* `dlt` uses a custom version of the [`jsonl` file format](../dlt-ecosystem/file-formats/jsonl.md) between the **extract** and **normalize** stages.
+* `dlt` uses a custom version of the [JSONL file format](../dlt-ecosystem/file-formats/jsonl.md) between the **extract** and **normalize** stages.
* Files created between the **normalize** and **load** stages are the same files that will be loaded to the destination.
:::tip
diff --git a/docs/website/docs/running-in-production/alerting.md b/docs/website/docs/running-in-production/alerting.md
index 9e14399d4f..dd34949678 100644
--- a/docs/website/docs/running-in-production/alerting.md
+++ b/docs/website/docs/running-in-production/alerting.md
@@ -50,7 +50,7 @@ from dlt.common.runtime.slack import send_slack_message
hook = "https://hooks.slack.com/services/xxx/xxx/xxx"
# Iterate over each package in the load_info object
-for package in info.load_packages:
+for package in load_info.load_packages:
# Iterate over each table in the schema_update of the current package
for table_name, table in package.schema_update.items():
# Iterate over each column in the current table
diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md
index d0563ec7de..9756e4ac16 100644
--- a/docs/website/docs/running-in-production/running.md
+++ b/docs/website/docs/running-in-production/running.md
@@ -10,12 +10,11 @@ When running the pipeline in production, you may consider a few additions to you
```py
import dlt
-from chess import chess
if __name__ == "__main__":
pipeline = dlt.pipeline(pipeline_name="chess_pipeline", destination='duckdb', dataset_name="games_data")
# get data for a few famous players
- data = chess(['magnuscarlsen', 'vincentkeymer', 'dommarajugukesh', 'rpragchess'], start_month="2022/11", end_month="2022/12")
+ data = chess_source(['magnuscarlsen', 'vincentkeymer', 'dommarajugukesh', 'rpragchess'], start_month="2022/11", end_month="2022/12")
load_info = pipeline.run(data)
```
@@ -174,21 +173,22 @@ handler = logging.FileHandler('dlt.log')
logger.addHandler(handler)
```
You can intercept logs by using [loguru](https://loguru.readthedocs.io/en/stable/api/logger.html). To do so, follow the instructions below:
+
```py
import logging
import sys
import dlt
-from loguru import logger
+from loguru import logger as loguru_logger
class InterceptHandler(logging.Handler):
- @logger.catch(default=True, onerror=lambda _: sys.exit(1))
+ @loguru_logger.catch(default=True, onerror=lambda _: sys.exit(1))
def emit(self, record):
# Get the corresponding Loguru level if it exists.
try:
- level = logger.level(record.levelname).name
+ level = loguru_logger.level(record.levelname).name
except ValueError:
level = record.levelno
@@ -198,12 +198,12 @@ class InterceptHandler(logging.Handler):
frame = frame.f_back
depth += 1
- logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
+ loguru_logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
logger_dlt = logging.getLogger("dlt")
logger_dlt.addHandler(InterceptHandler())
-logger.add("dlt_loguru.log")
+loguru_logger.add("dlt_loguru.log")
```
## Handle exceptions, failed jobs, and retry the pipeline
@@ -291,14 +291,14 @@ the [tenacity](https://tenacity.readthedocs.io/en/latest/) library. The snippet
steps (`extract`, `normalize`) and for terminal exceptions.
```py
-from tenacity import stop_after_attempt, retry_if_exception, Retrying, retry
+from tenacity import stop_after_attempt, retry_if_exception, Retrying, retry, wait_exponential
from dlt.common.runtime.slack import send_slack_message
from dlt.pipeline.helpers import retry_load
if __name__ == "__main__":
pipeline = dlt.pipeline(pipeline_name="chess_pipeline", destination='duckdb', dataset_name="games_data")
# get data for a few famous players
- data = chess(['magnuscarlsen', 'rpragchess'], start_month="2022/11", end_month="2022/12")
+ data = chess_source(['magnuscarlsen', 'rpragchess'], start_month="2022/11", end_month="2022/12")
try:
for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1.5, min=4, max=10), retry=retry_if_exception(retry_load()), reraise=True):
@@ -319,7 +319,7 @@ if __name__ == "__main__":
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1.5, min=4, max=10), retry=retry_if_exception(retry_load(("extract", "load"))), reraise=True)
def load():
- data = chess(['magnuscarlsen', 'vincentkeymer', 'dommarajugukesh', 'rpragchess'], start_month="2022/11", end_month="2022/12")
+ data = chess_source(['magnuscarlsen', 'vincentkeymer', 'dommarajugukesh', 'rpragchess'], start_month="2022/11", end_month="2022/12")
return pipeline.run(data)
load_info = load()
diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md
index 3640f0e8d7..ddfef2cbe8 100644
--- a/docs/website/docs/tutorial/load-data-from-an-api.md
+++ b/docs/website/docs/tutorial/load-data-from-an-api.md
@@ -421,7 +421,7 @@ Let's handle this by changing our `fetch_github_data()` function first:
```py
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
-def fetch_github_data(endpoint, params={}, access_token=None):
+def fetch_github_data_with_token(endpoint, params={}, access_token=None):
url = f"{BASE_GITHUB_URL}/{endpoint}"
return paginate(
url,
@@ -431,11 +431,11 @@ def fetch_github_data(endpoint, params={}, access_token=None):
@dlt.source
-def github_source(access_token):
+def github_source_with_token(access_token: str):
for endpoint in ["issues", "comments", "traffic/clones"]:
params = {"per_page": 100}
yield dlt.resource(
- fetch_github_data(endpoint, params, access_token),
+ fetch_github_data_with_token(endpoint, params, access_token),
name=endpoint,
write_disposition="merge",
primary_key="id",
@@ -447,7 +447,7 @@ def github_source(access_token):
Here, we added an `access_token` parameter and now we can use it to pass the access token to the request:
```py
-load_info = pipeline.run(github_source(access_token="ghp_XXXXX"))
+load_info = pipeline.run(github_source_with_token(access_token="ghp_XXXXX"))
```
It's a good start. But we'd want to follow the best practices and not hardcode the token in the script. One option is to set the token as an environment variable, load it with `os.getenv()`, and pass it around as a parameter. dlt offers a more convenient way to handle secrets and credentials: it lets you inject the arguments using a special `dlt.secrets.value` argument value.
@@ -456,7 +456,7 @@ To use it, change the `github_source()` function to:
```py
@dlt.source
-def github_source(
+def github_source_with_token(
access_token: str = dlt.secrets.value,
):
...
@@ -482,13 +482,13 @@ Now we can run the script and it will load the data from the `traffic/clones` en
...
@dlt.source
-def github_source(
+def github_source_with_token(
access_token: str = dlt.secrets.value,
):
for endpoint in ["issues", "comments", "traffic/clones"]:
params = {"per_page": 100}
yield dlt.resource(
- fetch_github_data(endpoint, params, access_token),
+ fetch_github_data_with_token(endpoint, params, access_token),
name=endpoint,
write_disposition="merge",
primary_key="id",
@@ -514,7 +514,7 @@ from dlt.sources.helpers.rest_client import paginate
BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}"
-def fetch_github_data(repo_name, endpoint, params={}, access_token=None):
+def fetch_github_data_with_token_and_params(repo_name, endpoint, params={}, access_token=None):
"""Fetch data from the GitHub API based on repo_name, endpoint, and params."""
url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}"
return paginate(
@@ -525,14 +525,14 @@ def fetch_github_data(repo_name, endpoint, params={}, access_token=None):
@dlt.source
-def github_source(
+def github_source_with_token_and_repo(
repo_name: str = dlt.config.value,
access_token: str = dlt.secrets.value,
):
for endpoint in ["issues", "comments", "traffic/clones"]:
params = {"per_page": 100}
yield dlt.resource(
- fetch_github_data(repo_name, endpoint, params, access_token),
+ fetch_github_data_with_token_and_params(repo_name, endpoint, params, access_token),
name=endpoint,
write_disposition="merge",
primary_key="id",
diff --git a/docs/website/docs/tutorial/rest-api.md b/docs/website/docs/tutorial/rest-api.md
index e1c4d63daa..56051e80de 100644
--- a/docs/website/docs/tutorial/rest-api.md
+++ b/docs/website/docs/tutorial/rest-api.md
@@ -157,13 +157,13 @@ Let's break down the configuration of the REST API source. It consists of three
```py
config: RESTAPIConfig = {
"client": {
- ...
+ # ...
},
"resource_defaults": {
- ...
+ # ...
},
"resources": [
- ...
+ # ...
],
}
```
diff --git a/docs/website/docs/walkthroughs/add-incremental-configuration.md b/docs/website/docs/walkthroughs/add-incremental-configuration.md
index b51cde8470..a53d114a67 100644
--- a/docs/website/docs/walkthroughs/add-incremental-configuration.md
+++ b/docs/website/docs/walkthroughs/add-incremental-configuration.md
@@ -187,7 +187,7 @@ Here’s a walkthrough:
# Load table "contact", incrementally starting at a given timestamp
source = sql_database().with_resources("contact")
source.contact.apply_hints(incremental=dlt.sources.incremental(
- "created_at", initial_value=datetime(2024, 4, 1, 0, 0, 0)))
+ "created_at", initial_value=datetime.datetime(2024, 4, 1, 0, 0, 0)))
# Run the pipeline
info = pipeline.run(source, write_disposition="append")
@@ -253,7 +253,7 @@ Here’s a walkthrough:
# Merge records, 'contact' table, based on ID and last_modified_at timestamp
source = sql_database().with_resources("contact")
source.contact.apply_hints(incremental=dlt.sources.incremental(
- "last_modified_at", initial_value=datetime(2024, 4, 1, 0, 0, 0)),
+ "last_modified_at", initial_value=datetime.datetime(2024, 4, 1, 0, 0, 0)),
primary_key="id")
# Run the pipeline
diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md
index 799e448fed..34616bc154 100644
--- a/docs/website/docs/walkthroughs/add_credentials.md
+++ b/docs/website/docs/walkthroughs/add_credentials.md
@@ -115,7 +115,7 @@ import dlt
from dlt.sources.helpers import requests
from dlt.common.configuration.inject import with_config
from dlt.common.configuration.specs import GcpServiceAccountCredentials
-from google.cloud import secretmanager
+from google.cloud import secretmanager # type: ignore[attr-defined]
@with_config(sections=("google_secrets",))
def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = dlt.secrets.value) -> dict:
diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md
index 7b05e96d32..d76bdad229 100644
--- a/docs/website/docs/walkthroughs/adjust-a-schema.md
+++ b/docs/website/docs/walkthroughs/adjust-a-schema.md
@@ -47,7 +47,7 @@ import_schema_path="schemas/import"
## 2. Run the pipeline to see the schemas
To see the schemas, you must run your pipeline again. The `schemas` and `import`/`export`
-directories will be created. In each directory, you'll see a `yaml` file (e.g., `chess.schema.yaml`).
+directories will be created. In each directory, you'll see a YAML file (e.g., `chess.schema.yaml`).
Look at the export schema (in the export folder): this is the schema that got inferred from the data
and was used to load it into the destination (e.g., `duckdb`).
@@ -75,7 +75,7 @@ In the next steps, we'll experiment a lot; you will be warned to set `dev_mode=T
:::caution
`dlt` will **not modify** tables after they are created.
-So if you have a `yaml` file, and you change it (e.g., change a data type or add a hint),
+So if you have a YAML file, and you change it (e.g., change a data type or add a hint),
then you need to **delete the dataset**
or set `dev_mode=True`:
```py
@@ -135,14 +135,14 @@ These steps ensure that the column order in your dataset matches your specificat
```py
# Define the data source and reorder columns using add_map
-data_source = resource().add_map(lambda row: {
+my_resource = resource().add_map(lambda row: {
'column3': row['column3'],
'column1': row['column1'],
'column2': row['column2']
})
# Run the pipeline
-load_info = pipeline.run(data_source)
+load_info = pipeline.run(my_resource)
```
In this example, the `add_map` function reorders columns by defining a new mapping. The lambda function specifies the desired order by rearranging the key-value pairs. When the pipeline runs, the data will load with the columns in the new order.
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md
index 4700f42689..78fafb6d83 100644
--- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md
@@ -135,7 +135,7 @@ default_task_args = {
@dag(
schedule=None,
- start_date=pendulum.datetime(2021, 1, 1),
+ start_date=pendulum.DateTime(2021, 1, 1),
catchup=False,
max_active_runs=1,
default_args=default_task_args
@@ -308,7 +308,7 @@ default_task_args = {
@dag(
schedule=None,
- start_date=pendulum.datetime(2021, 1, 1),
+ start_date=pendulum.DateTime(2021, 1, 1),
catchup=False,
max_active_runs=1,
default_args=default_task_args
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
index aa9465bc53..7cd2b6f881 100644
--- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
@@ -256,12 +256,12 @@ For a complete picture of Dagster's integration with dlt, please refer to their
for query in queries:
dimensions = query["dimensions"]
if "date" not in dimensions:
- dimensions.append("date")
+ dimensions.append("date") # type: ignore[attr-defined]
- resource_name = query["resource_name"]
+ resource_name: str = query["resource_name"] # type: ignore[assignment]
resource_list.append(
bigquery_adapter(
- dlt.resource(basic_report, name=resource_name, write_disposition="append")(
+ dlt.resource(data, name=resource_name, write_disposition="append")(
client=client,
rows_per_page=rows_per_page,
property_id=property_id,
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md
index 7e5683d61b..b3a6b0978a 100644
--- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md
@@ -6,31 +6,24 @@ keywords: [how to, deploy a pipeline, Cloud Function]
# Deploy a pipeline with Google Cloud Functions
-This guide shows you how to deploy a pipeline using the gcloud shell and `dlt` CLI commands. To deploy a pipeline using this method, you must have a working knowledge of GCP and its associated services, such as Cloud Functions, Cloud Source Repositories, Shell Editor, IAM and permissions, and GCP service accounts.
+This guide shows you how to deploy a pipeline using the gcloud shell and dlt CLI commands. To deploy a pipeline using this method, you must have a working knowledge of GCP and its associated services, such as Cloud Functions, IAM and permissions, and GCP service accounts.
-To deploy a pipeline using GCP Cloud Functions, you'll first need to set up an empty repo in Cloud Source Repositories, a service provided by GCP for hosting repositories, or you can clone it to your local machine and then deploy it using the Google Cloud CLI.
+To deploy a pipeline with GCP Cloud Functions, navigate to the directory on your local machine or cloud repository (e.g., GitHub, Bitbucket) from where the function code is to be deployed.
-## 1. Setup pipeline in Google Cloud Repositories
+## 1. Setup pipeline
-To deploy the pipeline, we'll use the Google Cloud Source Repositories method.
-
-1. Sign in to your GCP account and enable the Cloud Functions API.
-1. To set up the environment, you can follow these steps:
- - Create an empty repo in Cloud Source Repositories.
- - After creating the repo, click Edit repo to open it in a "Shell Editor".
- - You can also skip creating the repo and use the Shell Editor directly, depending on your requirements.
-1. In this guide, we'll be setting up the `dlt`
+1. In this guide, we'll be setting up the dlt
[Notion verified source](../../dlt-ecosystem/verified-sources/notion). However, you can use any verified source or create a custom one to suit your needs.
-1. In the Shell Editor:
+1. In the terminal:
- Run the following command to initialize the verified source with Notion and create a pipeline example with BigQuery as the target.
```sh
dlt init notion bigquery
```
- - After the command is executed, new files and folders with the necessary configurations are created in the main directory where the command was executed.
+ - After the command executes, new files and folders with the necessary configurations are created in the main directory where the command was executed.
- - Detailed information about initializing a verified source and a pipeline example can be found in the `dlthub` [documentation](../../dlt-ecosystem/verified-sources/notion).
+ - Detailed information about initializing a verified source and a pipeline example can be found in the dlthub [documentation](../../dlt-ecosystem/verified-sources/notion).
1. Create a new Python file called "main.py" in the main directory. The file can be configured as follows:
```py
from notion_pipeline import load_databases
@@ -39,19 +32,20 @@ To deploy the pipeline, we'll use the Google Cloud Source Repositories method.
load_databases()
return "Pipeline run successfully!"
```
- By default, Google Cloud Functions looks for the main.py file in the main directory, and we called the `load_databases()` function from notion_pipeline.py as shown above.
-1. If you need any additional dependencies, add them to `requirements.txt` that got created.
+ By default, Google Cloud Functions looks for the "main.py" file in the directory.
+
+1. If you need any additional dependencies, add them to the "requirements.txt" that was created.
## 2. Deploying GCP Cloud Function
-In a Shell Editor, navigate to the main directory where the "main.py" file is located and run the following command in the terminal:
+In the terminal, navigate to the directory where the "main.py" file is located and run the following command:
```sh
gcloud functions deploy pipeline_notion --runtime python310 \
--trigger-http --allow-unauthenticated --source . --timeout 300
```
-- This command uses a function called "pipeline_notion" with Python 3.10 as the runtime environment, an HTTP trigger, and allows unauthenticated access. The source "." refers to all files in the directory. The timeout is set to 5 minutes (300 seconds).
+- This command deploys a function called `pipeline_notion` with Python 3.10 as the runtime environment and an HTTP trigger, and allows unauthenticated access. The source "." refers to all files in the directory. The timeout is set to 5 minutes (300 seconds). To learn more about deploying the cloud function, read the [documentation here.](https://cloud.google.com/functions/docs/deploy)
- If you are uploading a large number of files to the destination, you can increase this to 60 minutes for HTTP functions and 10 minutes for event-driven functions. To learn more about the function timeout, see the [documentation here](https://cloud.google.com/functions/docs/configuring/timeout).
> Your project has a default service account associated with the project ID. Please assign the `Cloud Functions Developer` role to the associated service account.
@@ -62,26 +56,26 @@ Environmental variables can be declared in the Cloud Function in two ways:
#### 3a. Directly in the function:
-- Go to the Google Cloud Function and select the deployed function. Click 'EDIT'.
-- Navigate to the 'BUILD' tab and click 'ADD VARIABLE' under 'BUILD ENVIRONMENTAL VARIABLE'.
+- Go to the Google Cloud Function and select the deployed function. Click "EDIT".
+- Navigate to the "BUILD" tab and click "ADD VARIABLE" under "BUILD ENVIRONMENTAL VARIABLE".
- Enter a name for the variable that corresponds to the argument required by the pipeline. Make sure
to capitalize the variable name if it is specified in "secrets.toml". For example, if the variable
- name is `api_key`, set the variable name to "API_KEY".
+ name is `api_key`, set the variable name to `API_KEY`.
- Enter the value for the Notion API key.
- Click Next and deploy the function.
#### 3b. Use GCP Secret Manager:
-- Go to the Google Cloud function and select the function you deployed. Click 'EDIT'.
-- In the 'Runtime, Build, Connections and Security Settings' section, select 'Security and Images
- Repo'.
-- Click 'Add a secret reference' and select the secret you created, for example, 'notion_secret'.
-- Set the 'Reference method' to 'Mounted as environment variable'.
-- In the 'Environment Variable' field, enter the name of the environment variable that corresponds
+- Go to the Google Cloud function and select the function you deployed. Click "EDIT".
+- In the "Runtime, Build, Connections and Security Settings" section, select "Security and Images
+ Repo".
+- Click "Add a secret reference" and select the secret you created, for example, "notion_secret".
+- Set the "Reference method" to "Mounted as environment variable".
+- In the "Environment Variable" field, enter the environment variable's name that corresponds
to the argument required by the pipeline. Remember to capitalize the variable name if it is
required by the pipeline and specified in secrets.toml. For example, if the variable name is
- api_key, you would declare the environment variable as "API_KEY".
-- Finally, click 'DEPLOY' to deploy the function. The HTTP trigger will now successfully execute the
+ `api_key`, you would declare the environment variable as `API_KEY`.
+- Finally, click "DEPLOY" to deploy the function. The HTTP trigger will now successfully execute the
pipeline each time the URL is triggered.
- Assign the `Secret Manager Secret Accessor` role to the service account used to deploy the cloud
function. Typically, this is the default service account associated with the Google Project in
@@ -90,8 +84,8 @@ Environmental variables can be declared in the Cloud Function in two ways:
## 4. Monitor (and manually trigger) the cloud function
To manually trigger the created function, you can open the trigger URL created by the Cloud Function
-in the address bar. The message "Pipeline run successfully!" would mean that the pipeline was
+in the address bar. The message "Pipeline run successfully!" confirms that the pipeline was
successfully run and the data was successfully loaded into the destination.
-That's it! Have fun using `dlt` in Google Cloud Functions!
+That's it! Have fun using dlt in Google Cloud Functions!
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-run.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-run.md
new file mode 100644
index 0000000000..7dcb3979b0
--- /dev/null
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-run.md
@@ -0,0 +1,90 @@
+---
+title: Deploy with Google Cloud Run
+description: Step-by-step guide on deploying a pipeline with Google Cloud Run.
+keywords: [how to, deploy a pipeline, Cloud Run]
+---
+
+# Deploy a pipeline with Google Cloud Run
+
+This guide explains how to deploy a pipeline using the gcloud shell and dlt CLI commands. To deploy a pipeline using this method, you must have a working knowledge of GCP and its associated services, such as Cloud Run jobs, IAM and permissions, and GCP service accounts.
+
+To deploy the pipeline with Google Cloud Run jobs, first navigate to the directory on your local machine or in your cloud repository (e.g., GitHub, Bitbucket) where the job code will be created.
+
+## 1. Setup pipeline
+
+1. In this guide, we set up the dlt
+ [Notion verified source](../../dlt-ecosystem/verified-sources/notion). However, you can use any verified source or create a custom one.
+
+1. Run the following command to initialize the verified source with Notion and create a pipeline example with BigQuery as the target.
+
+ ```sh
+ dlt init notion bigquery
+ ```
+
+ - After the command executes, new files and folders with the necessary configurations are created in the main directory.
+
+ - Detailed information about initializing a verified source and a pipeline example is available in the dlthub [documentation](../../dlt-ecosystem/verified-sources/notion).
+1. Create a new file named "Procfile" in the main directory and configure it as follows:
+ ```text
+ web: python3 notion_pipeline.py
+ ```
+   This instructs the Cloud Run job to run "notion_pipeline.py" using python3.
+
+1. If you need any additional dependencies, add them to the "requirements.txt" that was created.
+
+## 2. Deploying GCP Cloud Run Jobs
+
+In the terminal, navigate to the directory where the "notion_pipeline.py" file is located and run the following command:
+
+```sh
+gcloud run jobs deploy notion-pipeline-job \
+ --source . \
+ --tasks 1 \
+ --max-retries 5 \
+ --cpu 4 \
+ --memory 4Gi \
+ --region us-central1 \
+ --project dlthub-sandbox
+```
+
+- This command creates a Cloud Run job. The source "." refers to all files in the directory. The number of vCPUs is set to 4 and the memory to 4 GiB; you can tweak these parameters to suit your requirements. To learn more about deploying the Cloud Run job, read the [documentation here.](https://cloud.google.com/run/docs/create-jobs#gcloud)
+- By default, Cloud Run jobs have a 10-minute timeout, which you can increase to up to 1440 minutes (24 hours). To learn more about the task timeout, see the [documentation here](https://cloud.google.com/run/docs/configuring/task-timeout).
+
+> Your project has a default service account associated with the project ID. Please assign the `roles/run.invoker` role to the associated service account.
+
+## 3. Setting up environment variables in Cloud Run
+Do not add secrets directly to the "secrets.toml" file, as it will be included in the deployed container for executing the job. Instead, use environment variables or Google Secret Manager, as described below.
+Environment variables can be set in Cloud Run in two ways:
+
+#### 3a. Directly in the job:
+
+- Go to Google Cloud Run and select the deployed job. Click "VIEW AND EDIT JOB CONFIGURATION".
+- Under "CONTAINERS" > "VARIABLE AND SECRETS", click "ADD VARIABLE".
+- Enter a name for the variable according to the pipeline's requirements. Make sure
+ to capitalize the variable name if it is specified in "secrets.toml". For example, if the variable
+ name is `sources.notion.api_key`, set the variable name to `SOURCES__NOTION__API_KEY`.
+- Enter the value for the Notion API key.
+- Click "Done" and update the function.
+
+#### 3b. Use GCP Secret Manager:
+
+- Go to Google Cloud Run and select the deployed job. Click "VIEW AND EDIT JOB CONFIGURATION".
+- Under "CONTAINERS" > "VARIABLE AND SECRETS", click "ADD VARIABLE".
+- Click "Add a secret reference" and select the secret you created, for example, "notion_secret".
+- Set the "REFERENCE A SECRET" to mounted as an environment variable.
+- In the "Environment Variable" field, enter the environment variable's name that corresponds
+ to the argument required by the pipeline. Remember to capitalize the variable name if it is
+ required by the pipeline and specified in secrets.toml. For example, if the variable name is
+ `sources.notion.api_key`, you would declare the environment variable as `SOURCES__NOTION__API_KEY`.
+- Select the secret to reference.
+- Click "Done" and update the function.
+- “Assign the Secret Manager Secret Accessor role to the Cloud Run service account.
+ Typically, this is the default service account associated with the Google Project in
+ which the function is being created.
+
+## 4. Monitor (and manually trigger) the Cloud Run job
+
+To manually trigger the job, click "EXECUTE". You can also set up a scheduled trigger to automate runs.
+
+That's it! Have fun using dlt in Google Cloud Run!
+
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md
new file mode 100644
index 0000000000..799fe2fe7f
--- /dev/null
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md
@@ -0,0 +1,82 @@
+---
+title: Deploy with Modal
+description: How to deploy a pipeline with Modal
+keywords: [how to, deploy a pipeline, Modal]
+canonical: https://modal.com/blog/analytics-stack
+---
+
+# Deploy with Modal
+
+## Introduction to Modal
+
+[Modal](https://modal.com/) is a serverless platform designed for developers. It allows you to run and deploy code in the cloud without managing infrastructure.
+
+With Modal, you can perform tasks like running generative models, large-scale batch jobs, and job queues, all while easily scaling compute resources.
+
+### Modal features
+
+- Serverless Compute: No infrastructure management; scales automatically from zero to thousands of CPUs/GPUs.
+- Cloud Functions: Run Python code in the cloud instantly and scale horizontally.
+- GPU/CPU Scaling: Easily attach GPUs for heavy tasks like AI model training with a single line of code.
+- Web Endpoints: Expose any function as an HTTPS API endpoint quickly.
+- Scheduled Jobs: Convert Python functions into scheduled tasks effortlessly.
+
+To learn more, please refer to [Modal's documentation.](https://modal.com/docs)
+
+
+## How to run dlt on Modal
+
+Here’s a dlt project setup to copy data from a public MySQL database into DuckDB as a destination:
+
+### Step 1: Initialize source
+Run the `dlt init` CLI command to initialize the SQL database source and set up the `sql_database_pipeline.py` template.
+```sh
+dlt init sql_database duckdb
+```
+
+### Step 2: Define Modal Image
+Open the file and define the Modal Image you want to run `dlt` in:
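+For example, a minimal image definition might look like the sketch below. It assumes the pipeline only needs dlt with the `duckdb` destination and `sql_database` source extras plus the `pymysql` driver, and it creates a Modal Volume to persist the DuckDB file between runs:
+
+```py
+import modal
+
+# Define the Modal Image: a slim Debian base with dlt, the DuckDB destination,
+# the sql_database source, and the MySQL driver installed
+image = modal.Image.debian_slim().pip_install(
+    "dlt>=1.1.0",
+    "dlt[duckdb]",  # destination
+    "dlt[sql_database]",  # source (MySQL)
+    "pymysql",  # database driver for the MySQL source
+)
+
+app = modal.App("example-dlt", image=image)
+
+# Modal Volume used to store the DuckDB database file
+vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
+```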
+
+
+### Step 3: Define Modal Function
+A Modal Function is a containerized environment that runs tasks.
+It can be scheduled (e.g., daily or on a Cron schedule), request more CPU/memory, and scale across
+multiple containers.
+
+Here’s how to include your SQL pipeline in the Modal Function:
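+A sketch of such a function is shown below. It continues the script from Step 2 (reusing `app` and `vol`) and assumes a Modal secret named `sql-secret` that holds the connection string described in Step 4:
+
+```py
+@app.function(
+    volumes={"/data/": vol},  # mount the Volume so the DuckDB file persists
+    schedule=modal.Period(days=1),  # run the pipeline once a day
+    secrets=[modal.Secret.from_name("sql-secret")],  # exposes the credentials as env vars
+)
+def load_tables() -> None:
+    import dlt
+    from dlt.sources.sql_database import sql_database
+
+    # Load the "family" and "genome" tables from the public Rfam MySQL database
+    source = sql_database().with_resources("family", "genome")
+
+    pipeline = dlt.pipeline(
+        pipeline_name="sql_to_duckdb_pipeline",
+        # write the DuckDB database file to the mounted Volume
+        destination=dlt.destinations.duckdb("/data/rfam.duckdb"),
+        dataset_name="sql_to_duckdb_pipeline_data",
+        progress="log",  # log pipeline progress
+    )
+
+    # Run the pipeline and print the load statistics
+    load_info = pipeline.run(source)
+    print(load_info)
+```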
+
+
+
+### Step 4: Set up credentials
+You can securely store your credentials using Modal secrets. When you reference secrets within a Modal script,
+the defined secret is automatically set as an environment variable. dlt natively supports environment variables,
+enabling seamless integration of your credentials. For example, to declare a connection string, you can define it as follows:
+```text
+SOURCES__SQL_DATABASE__CREDENTIALS=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam
+```
+Once this variable is present in the function's environment, dlt uses the credentials automatically.
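+As a quick sanity check (a minimal local sketch, not part of the deployed app), you can set the same variable in your own environment and confirm that dlt resolves the credentials without any entry in `secrets.toml`:
+
+```py
+import os
+
+# The same variable the Modal secret will provide inside the container
+os.environ["SOURCES__SQL_DATABASE__CREDENTIALS"] = (
+    "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
+)
+
+from dlt.sources.sql_database import sql_database
+
+# dlt resolves the credentials from the environment when the source is created
+source = sql_database().with_resources("family")
+```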
+For more details, please refer to the [documentation.](../../general-usage/credentials/setup#environment-variables)
+
+### Step 5: Run pipeline
+To run your pipeline a single time, use the following command:
+```sh
+modal run sql_pipeline.py
+```
+
+### Step 6: Deploy
+If you want to deploy your pipeline on Modal for continuous execution or scheduling, use this command:
+```sh
+modal deploy sql_pipeline.py
+```
+
+## Advanced configuration
+* Use [Proxy IPs](https://modal.com/docs/guide/proxy-ips) to connect to resources in your private network
+* Sync tables in parallel using [map()](https://modal.com/docs/guide/scale)
+
+
+## More examples
+
+For a practical, real-world example, check out the article ["Building a Cost-Effective Analytics Stack with Modal, dlt, and dbt"](https://modal.com/blog/analytics-stack).
+
+This article illustrates how to automate a workflow for loading data from Postgres into Snowflake using dlt, providing valuable insights into building an efficient analytics pipeline.
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy_snippets/__init__.py b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy_snippets/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy_snippets/deploy-with-modal-snippets.py b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy_snippets/deploy-with-modal-snippets.py
new file mode 100644
index 0000000000..4857984759
--- /dev/null
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy_snippets/deploy-with-modal-snippets.py
@@ -0,0 +1,69 @@
+import os
+
+import modal
+
+from tests.pipeline.utils import assert_load_info
+
+
+def test_modal_snippet() -> None:
+ # @@@DLT_SNIPPET_START modal_image
+ # Define the Modal Image
+ image = modal.Image.debian_slim().pip_install(
+ "dlt>=1.1.0",
+ "dlt[duckdb]", # destination
+ "dlt[sql_database]", # source (MySQL)
+ "pymysql", # database driver for MySQL source
+ )
+
+ app = modal.App("example-dlt", image=image)
+
+ # Modal Volume used to store the duckdb database file
+ vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
+ # @@@DLT_SNIPPET_END modal_image
+
+ # @@@DLT_SNIPPET_START modal_function
+ @app.function(
+ volumes={"/data/": vol},
+ schedule=modal.Period(days=1),
+ secrets=[modal.Secret.from_name("sql-secret")],
+ serialized=True,
+ )
+ def load_tables() -> None:
+ import dlt
+ from dlt.sources.sql_database import sql_database
+
+ # Define the source database credentials; in production, you would save this as a Modal Secret which can be referenced here as an environment variable
+ os.environ["SOURCES__SQL_DATABASE__CREDENTIALS"] = (
+ "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
+ )
+ # Load tables "family" and "genome"
+ source = sql_database().with_resources("family", "genome")
+
+ # Create dlt pipeline object
+ pipeline = dlt.pipeline(
+ pipeline_name="sql_to_duckdb_pipeline",
+ destination=dlt.destinations.duckdb(
+ "/data/rfam.duckdb"
+ ), # write the duckdb database file to this file location, which will get mounted to the Modal Volume
+ dataset_name="sql_to_duckdb_pipeline_data",
+ progress="log", # output progress of the pipeline
+ )
+
+ # Run the pipeline
+ load_info = pipeline.run(source)
+
+ # Print run statistics
+ print(load_info)
+ # @@@DLT_SNIPPET_END modal_function
+
+        assert_load_info(load_info)
+
+ import pytest
+ from modal.exception import ExecutionError
+
+ # Any additional logic or calling the function
+ with pytest.raises(ExecutionError) as excinfo:
+ load_tables.remote()
+ # >> modal.exception.ExecutionError:
+ # >> Function has not been hydrated with the metadata it needs to run on Modal, because the App it is defined on is not running.
+ assert "hydrated" in str(excinfo.value)
diff --git a/docs/website/docs/walkthroughs/run-a-pipeline.md b/docs/website/docs/walkthroughs/run-a-pipeline.md
index 0be66b448b..49abe8675f 100644
--- a/docs/website/docs/walkthroughs/run-a-pipeline.md
+++ b/docs/website/docs/walkthroughs/run-a-pipeline.md
@@ -18,12 +18,11 @@ like the one below that loads data from the [chess.com](https://www.chess.com) A
```py
import dlt
-from chess import chess
if __name__ == "__main__":
pipeline = dlt.pipeline(pipeline_name="chess_pipeline", destination='duckdb', dataset_name="games_data")
# get data for a few famous players
- data = chess(['magnuscarlsen', 'rpragchess'], start_month="2022/11", end_month="2022/12")
+ data = chess_source(['magnuscarlsen', 'rpragchess'], start_month="2022/11", end_month="2022/12")
load_info = pipeline.run(data)
```
@@ -55,7 +54,7 @@ progress bar libraries, Python loggers, or just a text console. To demonstrate,
script to get a year of chess games data:
```py
-data = chess(['magnuscarlsen', 'rpragchess'], start_month="2021/11", end_month="2022/12")
+data = chess_source(['magnuscarlsen', 'rpragchess'], start_month="2021/11", end_month="2022/12")
```
Install [enlighten](https://github.com/Rockhopper-Technologies/enlighten). Enlighten displays
diff --git a/docs/website/docs/walkthroughs/share-a-dataset.md b/docs/website/docs/walkthroughs/share-a-dataset.md
index 4e7ea9f843..c45bfc8315 100644
--- a/docs/website/docs/walkthroughs/share-a-dataset.md
+++ b/docs/website/docs/walkthroughs/share-a-dataset.md
@@ -17,7 +17,6 @@ BigQuery:
```py
import dlt
-from chess import chess
if __name__ == "__main__":
pipeline = dlt.pipeline(
@@ -26,7 +25,7 @@ if __name__ == "__main__":
dataset_name="games_data"
)
# get data for a few famous players
- data = chess(
+ data = chess_source(
data=['magnuscarlsen', 'rpragchess'],
start_month="2022/11",
end_month="2022/12"
diff --git a/docs/website/docusaurus.config.js b/docs/website/docusaurus.config.js
index c76ea38191..e825e4b9af 100644
--- a/docs/website/docusaurus.config.js
+++ b/docs/website/docusaurus.config.js
@@ -93,9 +93,6 @@ const config = {
theme: {
customCss: require.resolve('./src/css/custom.css'),
},
- gtag: {
- trackingID: ['G-7F1SE12JLR', 'G-PRHSCL1CMK'],
- },
}),
],
],
@@ -202,6 +199,10 @@ const config = {
async: true,
defer: true,
},
+ {
+ src: 'https://dlthub.com/js/tm.js',
+ async: true,
+ },
],
};
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js
index a87616990b..1edee20c81 100644
--- a/docs/website/sidebars.js
+++ b/docs/website/sidebars.js
@@ -256,7 +256,8 @@ const sidebars = {
'walkthroughs/add-incremental-configuration',
'general-usage/full-loading',
]
- }
+ },
+ 'dlt-ecosystem/notebooks'
]
},
{
@@ -278,10 +279,12 @@ const sidebars = {
'walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer',
'reference/explainers/airflow-gcp-cloud-composer',
'walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions',
+ 'walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-run',
'walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook',
'walkthroughs/deploy-a-pipeline/deploy-with-kestra',
'walkthroughs/deploy-a-pipeline/deploy-with-dagster',
'walkthroughs/deploy-a-pipeline/deploy-with-prefect',
+ 'walkthroughs/deploy-a-pipeline/deploy-with-modal',
]
},
{
@@ -343,7 +346,7 @@ const sidebars = {
title: 'File formats',
description: 'Overview of our loader file formats',
slug: 'dlt-ecosystem/file-formats',
- keywords: ['destination'],
+ keywords: ['destination', 'file formats'],
},
items: [
'dlt-ecosystem/file-formats/jsonl',
@@ -352,6 +355,20 @@ const sidebars = {
'dlt-ecosystem/file-formats/insert-format',
]
},
+ {
+ type: 'category',
+ label: 'Table formats',
+ link: {
+ type: 'generated-index',
+ title: 'Table formats',
+ slug: 'dlt-ecosystem/table-formats',
+        keywords: ['destination', 'table formats'],
+ },
+ items: [
+ 'dlt-ecosystem/table-formats/delta',
+ 'dlt-ecosystem/table-formats/iceberg',
+ ]
+ },
'reference/frequently-asked-questions',
],
},
diff --git a/poetry.lock b/poetry.lock
index f66c41aff6..00980992fa 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -79,112 +79,127 @@ awscli = ["awscli (>=1.31.2,<1.32.35)"]
boto3 = ["boto3 (>=1.33.2,<1.34.35)"]
[[package]]
-name = "aiohttp"
-version = "3.8.5"
-description = "Async http client/server framework (asyncio)"
+name = "aiohappyeyeballs"
+version = "2.4.3"
+description = "Happy Eyeballs for asyncio"
optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
files = [
- {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"},
- {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"},
- {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"},
- {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"},
- {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"},
- {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"},
- {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"},
- {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"},
- {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"},
- {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"},
- {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"},
- {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"},
- {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"},
- {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"},
- {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"},
- {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"},
- {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"},
- {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"},
- {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"},
- {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"},
- {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"},
- {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"},
- {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"},
- {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"},
- {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"},
- {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"},
- {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"},
- {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"},
- {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"},
- {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"},
- {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"},
- {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"},
- {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"},
- {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"},
- {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"},
- {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"},
- {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"},
- {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"},
- {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"},
- {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"},
- {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"},
- {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"},
- {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"},
- {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"},
- {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"},
- {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"},
- {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"},
- {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"},
- {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"},
- {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"},
- {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"},
- {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"},
- {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"},
- {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"},
- {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"},
- {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"},
- {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"},
- {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"},
- {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"},
- {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"},
- {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"},
- {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"},
- {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"},
- {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"},
- {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"},
- {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"},
- {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"},
- {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"},
- {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"},
- {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"},
- {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"},
- {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"},
- {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"},
- {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"},
- {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"},
- {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"},
- {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"},
- {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"},
- {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"},
- {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"},
- {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"},
- {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"},
- {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"},
- {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"},
- {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"},
- {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"},
- {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"},
+ {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"},
+ {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"},
]
-[package.dependencies]
+[[package]]
+name = "aiohttp"
+version = "3.10.10"
+description = "Async http client/server framework (asyncio)"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"},
+ {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"},
+ {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"},
+ {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"},
+ {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"},
+ {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"},
+ {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"},
+ {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"},
+ {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"},
+ {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"},
+ {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"},
+ {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"},
+ {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"},
+ {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"},
+ {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"},
+ {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"},
+ {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"},
+ {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"},
+ {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"},
+ {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"},
+ {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"},
+ {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"},
+ {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"},
+ {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"},
+ {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"},
+ {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"},
+ {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"},
+ {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"},
+ {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"},
+ {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"},
+ {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"},
+ {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"},
+ {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"},
+ {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"},
+ {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"},
+ {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"},
+ {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"},
+ {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"},
+ {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"},
+ {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"},
+ {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"},
+ {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"},
+ {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"},
+ {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"},
+ {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"},
+ {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"},
+ {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"},
+ {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"},
+ {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"},
+ {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"},
+ {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"},
+ {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"},
+ {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"},
+ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"},
+ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"},
+ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"},
+ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"},
+ {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"},
+ {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"},
+ {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"},
+ {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"},
+ {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"},
+ {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"},
+ {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"},
+ {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"},
+ {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"},
+ {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"},
+ {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"},
+ {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"},
+ {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"},
+ {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"},
+ {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"},
+ {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"},
+ {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"},
+ {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"},
+ {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"},
+ {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"},
+ {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"},
+ {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"},
+ {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"},
+ {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"},
+ {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"},
+ {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"},
+ {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"},
+ {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"},
+ {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"},
+ {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"},
+ {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"},
+ {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"},
+ {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"},
+ {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"},
+]
+
+[package.dependencies]
+aiohappyeyeballs = ">=2.3.0"
aiosignal = ">=1.1.2"
-async-timeout = ">=4.0.0a3,<5.0"
+async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
attrs = ">=17.3.0"
-charset-normalizer = ">=2.0,<4.0"
frozenlist = ">=1.1.1"
multidict = ">=4.5,<7.0"
-yarl = ">=1.0,<2.0"
+yarl = ">=1.12.0,<2.0"
[package.extras]
-speedups = ["Brotli", "aiodns", "cchardet"]
+speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
[[package]]
name = "aioitertools"
@@ -214,6 +229,20 @@ files = [
[package.dependencies]
frozenlist = ">=1.1.0"
+[[package]]
+name = "aiostream"
+version = "0.5.2"
+description = "Generator-based operators for asynchronous iteration"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiostream-0.5.2-py3-none-any.whl", hash = "sha256:054660370be9d37f6fe3ece3851009240416bd082e469fd90cc8673d3818cf71"},
+ {file = "aiostream-0.5.2.tar.gz", hash = "sha256:b71b519a2d66c38f0872403ab86417955b77352f08d9ad02ad46fc3926b389f4"},
+]
+
+[package.dependencies]
+typing-extensions = "*"
+
[[package]]
name = "alembic"
version = "1.13.2"
@@ -2167,32 +2196,33 @@ typing-extensions = ">=3.10.0"
[[package]]
name = "databricks-sql-connector"
-version = "3.3.0"
+version = "2.9.6"
description = "Databricks SQL Connector for Python"
optional = true
-python-versions = "<4.0.0,>=3.8.0"
+python-versions = "<4.0.0,>=3.7.1"
files = [
- {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"},
- {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"},
+ {file = "databricks_sql_connector-2.9.6-py3-none-any.whl", hash = "sha256:d830abf86e71d2eb83c6a7b7264d6c03926a8a83cec58541ddd6b83d693bde8f"},
+ {file = "databricks_sql_connector-2.9.6.tar.gz", hash = "sha256:e55f5b8ede8ae6c6f31416a4cf6352f0ac019bf6875896c668c7574ceaf6e813"},
]
[package.dependencies]
+alembic = ">=1.0.11,<2.0.0"
lz4 = ">=4.0.2,<5.0.0"
numpy = [
- {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""},
- {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""},
+ {version = ">=1.16.6", markers = "python_version >= \"3.7\" and python_version < \"3.11\""},
+ {version = ">=1.23.4", markers = "python_version >= \"3.11\""},
]
oauthlib = ">=3.1.0,<4.0.0"
openpyxl = ">=3.0.10,<4.0.0"
-pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""}
-pyarrow = ">=14.0.1,<17"
+pandas = {version = ">=1.2.5,<3.0.0", markers = "python_version >= \"3.8\""}
+pyarrow = [
+ {version = ">=6.0.0", markers = "python_version >= \"3.7\" and python_version < \"3.11\""},
+ {version = ">=10.0.1", markers = "python_version >= \"3.11\""},
+]
requests = ">=2.18.1,<3.0.0"
-thrift = ">=0.16.0,<0.21.0"
-urllib3 = ">=1.26"
-
-[package.extras]
-alembic = ["alembic (>=1.0.11,<2.0.0)", "sqlalchemy (>=2.0.21)"]
-sqlalchemy = ["sqlalchemy (>=2.0.21)"]
+sqlalchemy = ">=1.3.24,<2.0.0"
+thrift = ">=0.16.0,<0.17.0"
+urllib3 = ">=1.0"
[[package]]
name = "db-dtypes"
@@ -2459,17 +2489,17 @@ files = [
[[package]]
name = "deltalake"
-version = "0.19.1"
+version = "0.21.0"
description = "Native Delta Lake Python binding based on delta-rs with Pandas integration"
optional = true
python-versions = ">=3.8"
files = [
- {file = "deltalake-0.19.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ddaaaa9c85a17791c3997cf320ac11dc1725d16cf4b6f0ff1b130853e7b56cd0"},
- {file = "deltalake-0.19.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0184d5a3f0d4f4f1fb992c3bdc8736329b78b6a4faf1a278109ec35d9945c1d"},
- {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec9d117fcf6c198f3d554be2f3a6291ca3838530650db236741ff48d4d47abb4"},
- {file = "deltalake-0.19.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:447ef721319ed15f7b5f6da507efd5fed0e6172e5ae55ac044d5b8fc9b812e47"},
- {file = "deltalake-0.19.1-cp38-abi3-win_amd64.whl", hash = "sha256:b15bc343a9f8f3de80fbedcebd5d9472b539eb0f538a71739c7fcf699089127e"},
- {file = "deltalake-0.19.1.tar.gz", hash = "sha256:5e09fabb221fb81e989c283c16278eaffb6e85706d98364abcda5c0c6ca73598"},
+ {file = "deltalake-0.21.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:4b4a352da534a173d837906357c808cd36dd216974b9df2c3998acb98d04954a"},
+ {file = "deltalake-0.21.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:2e7762af03fed0a67ce8f89b6b75356a74e485f52923c73a5c850c6c574d3481"},
+ {file = "deltalake-0.21.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02870db9dfe4694034fb209cb4822f85ba1a11885e353e11fd0dfb51b0af67d2"},
+ {file = "deltalake-0.21.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92ccd486243dbe17c45a6ca06b108351dfd07708d34e75405fcb13033c63d176"},
+ {file = "deltalake-0.21.0-cp38-abi3-win_amd64.whl", hash = "sha256:dc84334a0f8df1f4f5c6ca9aaffbeffb896f43de6744d0c77c306ca8ba27041b"},
+ {file = "deltalake-0.21.0.tar.gz", hash = "sha256:88f92cede44b2737431dbf86d43ed3b3c8cb73db56e99138aea1a6d93e9c6821"},
]
[package.dependencies]
@@ -2653,57 +2683,63 @@ dates = ["pytz (>=2019.1)"]
[[package]]
name = "duckdb"
-version = "1.1.0"
+version = "1.1.2"
description = "DuckDB in-process database"
optional = false
python-versions = ">=3.7.0"
files = [
- {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5e4cbc408e6e41146dea89b9044dae7356e353db0c96b183e5583ee02bc6ae5d"},
- {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:6370ae27ec8167ccfbefb94f58ad9fdc7bac142399960549d6d367f233189868"},
- {file = "duckdb-1.1.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4e1c3414f7fd01f4810dc8b335deffc91933a159282d65fef11c1286bc0ded04"},
- {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6bc2a58689adf5520303c5f68b065b9f980bd31f1366c541b8c7490abaf55cd"},
- {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d02be208d2885ca085d4c852b911493b8cdac9d6eae893259da32bd72a437c25"},
- {file = "duckdb-1.1.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:655df442ceebfc6f3fd6c8766e04b60d44dddedfa90275d794f9fab2d3180879"},
- {file = "duckdb-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6e183729bb64be7798ccbfda6283ebf423c869268c25af2b56929e48f763be2f"},
- {file = "duckdb-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:61fb838da51e07ceb0222c4406b059b90e10efcc453c19a3650b73c0112138c4"},
- {file = "duckdb-1.1.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:7807e2f0d3344668e433f0dc1f54bfaddd410589611393e9a7ed56f8dec9514f"},
- {file = "duckdb-1.1.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:3da30b7b466f710d52caa1fdc3ef0bf4176ad7f115953cd9f8b0fbf0f723778f"},
- {file = "duckdb-1.1.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:b9b6a77ef0183f561b1fc2945fcc762a71570ffd33fea4e3a855d413ed596fe4"},
- {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16243e66a9fd0e64ee265f2634d137adc6593f54ddf3ef55cb8a29e1decf6e54"},
- {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42b910a149e00f40a1766dc74fa309d4255b912a5d2fdcc387287658048650f6"},
- {file = "duckdb-1.1.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:47849d546dc4238c0f20e95fe53b621aa5b08684e68fff91fd84a7092be91a17"},
- {file = "duckdb-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11ec967b67159361ceade34095796a8d19368ea5c30cad988f44896b082b0816"},
- {file = "duckdb-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:510b5885ed6c267b9c0e1e7c6138fdffc2dd6f934a5a95b76da85da127213338"},
- {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:657bc7ac64d5faf069a782ae73afac51ef30ae2e5d0e09ce6a09d03db84ab35e"},
- {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:89f3de8cba57d19b41cd3c47dd06d979bd2a2ffead115480e37afbe72b02896d"},
- {file = "duckdb-1.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f6486323ab20656d22ffa8f3c6e109dde30d0b327b7c831f22ebcfe747f97fb0"},
- {file = "duckdb-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78a4510f82431ee3f14db689fe8727a4a9062c8f2fbb3bcfe3bfad3c1a198004"},
- {file = "duckdb-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64bf2a6e23840d662bd2ac09206a9bd4fa657418884d69e5c352d4456dc70b3c"},
- {file = "duckdb-1.1.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23fc9aa0af74e3803ed90c8d98280fd5bcac8c940592bf6288e8fd60fb051d00"},
- {file = "duckdb-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1f3aea31341ce400640dd522e4399b941f66df17e39884f446638fe958d6117c"},
- {file = "duckdb-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:3db4ab31c20de4edaef152930836b38e7662cd71370748fdf2c38ba9cf854dc4"},
- {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3b6b4fe1edfe35f64f403a9f0ab75258cee35abd964356893ee37424174b7e4"},
- {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad02f50d5a2020822d1638fc1a9bcf082056f11d2e15ccfc1c1ed4d0f85a3be"},
- {file = "duckdb-1.1.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb66e9e7391801928ea134dcab12d2e4c97f2ce0391c603a3e480bbb15830bc8"},
- {file = "duckdb-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:069fb7bca459e31edb32a61f0eea95d7a8a766bef7b8318072563abf8e939593"},
- {file = "duckdb-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e39f9b7b62e64e10d421ff04480290a70129c38067d1a4f600e9212b10542c5a"},
- {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:55ef98bcc7ba745752607f1b926e8d9b7ce32c42c423bbad10c44820aefe23a7"},
- {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:e2a08175e43b865c1e9611efd18cacd29ddd69093de442b1ebdf312071df7719"},
- {file = "duckdb-1.1.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:0e3644b1f034012d82b9baa12a7ea306fe71dc6623731b28c753c4a617ff9499"},
- {file = "duckdb-1.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:211a33c1ddb5cc609f75eb43772b0b03b45d2fa89bec107e4715267ca907806a"},
- {file = "duckdb-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e74b6f8a5145abbf7e6c1a2a61f0adbcd493c19b358f524ec9a3cebdf362abb"},
- {file = "duckdb-1.1.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58f1633dd2c5af5088ae2d119418e200855d0699d84f2fae9d46d30f404bcead"},
- {file = "duckdb-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d18caea926b1e301c29b140418fca697aad728129e269b4f82c2795a184549e1"},
- {file = "duckdb-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cd9fb1408942411ad360f8414bc3fbf0091c396ca903d947a10f2e31324d5cbd"},
- {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bd11bc899cebf5ff936d1276a2dfb7b7db08aba3bcc42924afeafc2163bddb43"},
- {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:53825a63193c582a78c152ea53de8d145744ddbeea18f452625a82ebc33eb14a"},
- {file = "duckdb-1.1.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:29dc18087de47563b3859a6b98bbed96e1c96ce5db829646dc3b16a916997e7d"},
- {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecb19319883564237a7a03a104dbe7f445e73519bb67108fcab3d19b6b91fe30"},
- {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aac2fcabe2d5072c252d0b3087365f431de812d8199705089fb073e4d039d19c"},
- {file = "duckdb-1.1.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d89eaaa5df8a57e7d2bc1f4c46493bb1fee319a00155f2015810ad2ace6570ae"},
- {file = "duckdb-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d86a6926313913cd2cc7e08816d3e7f72ba340adf2959279b1a80058be6526d9"},
- {file = "duckdb-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8333f3e85fa2a0f1c222b752c2bd42ea875235ff88492f7bcbb6867d0f644eb"},
- {file = "duckdb-1.1.0.tar.gz", hash = "sha256:b4d4c12b1f98732151bd31377753e0da1a20f6423016d2d097d2e31953ec7c23"},
+ {file = "duckdb-1.1.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:91e7f99cf5cab1d26f92cb014429153497d805e79689baa44f4c4585a8cb243f"},
+ {file = "duckdb-1.1.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:0107de622fe208142a1108263a03c43956048dcc99be3702d8e5d2aeaf99554c"},
+ {file = "duckdb-1.1.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:8a09610f780857677725897856f8cdf3cafd8a991f871e6cb8ba88b2dbc8d737"},
+ {file = "duckdb-1.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0f0ddac0482f0f3fece54d720d13819e82ae26c01a939ffa66a87be53f7f665"},
+ {file = "duckdb-1.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84103373e818758dfa361d27781d0f096553843c5ffb9193260a0786c5248270"},
+ {file = "duckdb-1.1.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bfdfd23e2bf58014ad0673973bd0ed88cd048dfe8e82420814a71d7d52ef2288"},
+ {file = "duckdb-1.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:25889e6e29b87047b1dd56385ac08156e4713c59326cc6fff89657d01b2c417b"},
+ {file = "duckdb-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:312570fa5277c3079de18388b86c2d87cbe1044838bb152b235c0227581d5d42"},
+ {file = "duckdb-1.1.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:568439ea4fce8cb72ec1f767cd510686a9e7e29a011fc7c56d990059a6e94e48"},
+ {file = "duckdb-1.1.2-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:74974f2d7210623a5d61b1fb0cb589c6e5ffcbf7dbb757a04c5ba24adcfc8cac"},
+ {file = "duckdb-1.1.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:e26422a3358c816d764639070945b73eef55d1b4df990989e3492c85ef725c21"},
+ {file = "duckdb-1.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87e972bd452eeeab197fe39dcaeecdb7c264b1f75a0ee67e532e235fe45b84df"},
+ {file = "duckdb-1.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a6b73e70b73c8df85da383f6e557c03cad5c877868b9a7e41715761e8166c1e"},
+ {file = "duckdb-1.1.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:623cb1952466aae5907af84107bcdec25a5ca021a8b6441e961f41edc724f6f2"},
+ {file = "duckdb-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9fc0b550f96901fa7e76dc70a13f6477ad3e18ef1cb21d414c3a5569de3f27e"},
+ {file = "duckdb-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:181edb1973bd8f493bcb6ecfa035f1a592dff4667758592f300619012ba251c0"},
+ {file = "duckdb-1.1.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:83372b1b411086cac01ab2071122772fa66170b1b41ddbc37527464066083668"},
+ {file = "duckdb-1.1.2-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:db37441deddfee6ac35a0c742d2f9e90e4e50b9e76d586a060d122b8fc56dada"},
+ {file = "duckdb-1.1.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:19142a77e72874aeaa6fda30aeb13612c6de5e8c60fbcc3392cea6ef0694eeaf"},
+ {file = "duckdb-1.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:099d99dd48d6e4682a3dd6233ceab73d977ebe1a87afaac54cf77c844e24514a"},
+ {file = "duckdb-1.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be86e586ca7af7e807f72479a2b8d0983565360b19dbda4ef8a9d7b3909b8e2c"},
+ {file = "duckdb-1.1.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:578e0953e4d8ba8da0cd69fb2930c45f51ce47d213b77d8a4cd461f9c0960b87"},
+ {file = "duckdb-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:72b5eb5762c1a5e68849c7143f3b3747a9f15c040e34e41559f233a1569ad16f"},
+ {file = "duckdb-1.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:9b4c6b6a08180261d98330d97355503961a25ca31cd9ef296e0681f7895b4a2c"},
+ {file = "duckdb-1.1.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:695dcbc561374b126e86659709feadf883c9969ed718e94713edd4ba15d16619"},
+ {file = "duckdb-1.1.2-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:ada29be1e889f486c6cf1f6dffd15463e748faf361f33996f2e862779edc24a9"},
+ {file = "duckdb-1.1.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:6ca722738fa9eb6218619740631de29acfdd132de6f6a6350fee5e291c2f6117"},
+ {file = "duckdb-1.1.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c796d33f1e5a0c8c570d22da0c0b1db8578687e427029e1ce2c8ce3f9fffa6a3"},
+ {file = "duckdb-1.1.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5c0996988a70dd3bc8111d9b9aeab7e38ed1999a52607c5f1b528e362b4dd1c"},
+ {file = "duckdb-1.1.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c37b039f6d6fed14d89450f5ccf54922b3304192d7412e12d6cc8d9e757f7a2"},
+ {file = "duckdb-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8c766b87f675c76d6d17103bf6fb9fb1a9e2fcb3d9b25c28bbc634bde31223e"},
+ {file = "duckdb-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:e3e6300b7ccaf64b609f4f0780a6e1d25ab8cf34cceed46e62c35b6c4c5cb63b"},
+ {file = "duckdb-1.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a01fae9604a54ecbc26e7503c522311f15afbd2870e6d8f6fbef4545dfae550"},
+ {file = "duckdb-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:492b1d86a696428bd3f14dc1c7c3230e2dbca8978f288be64b04a26e0e00fad5"},
+ {file = "duckdb-1.1.2-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1bba58459ad897a78c4e478a097626fc266459a40338cecc68a49a8d5dc72fb7"},
+ {file = "duckdb-1.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d395a3bf510bf24686821eec15802624797dcb33e8f14f8a7cc8e17d909474af"},
+ {file = "duckdb-1.1.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:fd800f75728727fe699ed1eb22b636867cf48c9dd105ee88b977e20c89df4509"},
+ {file = "duckdb-1.1.2-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:d8caaf43909e49537e26df51d80d075ae2b25a610d28ed8bd31d6ccebeaf3c65"},
+ {file = "duckdb-1.1.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:564166811c68d9c7f9911eb707ad32ec9c2507b98336d894fbe658b85bf1c697"},
+ {file = "duckdb-1.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19386aa09f0d6f97634ba2972096d1c80d880176dfb0e949eadc91c98262a663"},
+ {file = "duckdb-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9e8387bcc9a591ad14011ddfec0d408d1d9b1889c6c9b495a04c7016a24b9b3"},
+ {file = "duckdb-1.1.2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f8c5ff4970403ed3ff0ac71fe0ce1e6be3199df9d542afc84c424b444ba4ffe8"},
+ {file = "duckdb-1.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:9283dcca87c3260eb631a99d738fa72b8545ed45b475bc72ad254f7310e14284"},
+ {file = "duckdb-1.1.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f87edaf20001530e63a4f7bda13b55dc3152d7171226915f2bf34e0813c8759e"},
+ {file = "duckdb-1.1.2-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:efec169b3fe0b821e3207ba3e445f227d42dd62b4440ff79c37fa168a4fc5a71"},
+ {file = "duckdb-1.1.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:89164a2d29d56605a95ee5032aa415dd487028c4fd3e06d971497840e74c56e7"},
+ {file = "duckdb-1.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6858e10c60ff7e70e61d3dd53d2545c8b2609942e45fd6de38cd0dee52932de3"},
+ {file = "duckdb-1.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca967c5a57b1d0cb0fd5e539ab24110e5a59dcbedd365bb2dc80533d6e44a8d"},
+ {file = "duckdb-1.1.2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4ce949f1d7999aa6a046eb64067eee41d4c5c2872ba4fa408c9947742d0c7231"},
+ {file = "duckdb-1.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ba6d1f918e6ca47a368a0c32806016405cb9beb2c245806b0ca998f569d2bdf"},
+ {file = "duckdb-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:7111fd3e7b334a7be383313ce29918b7c643e4f6ef44d6d63c3ab3fa6716c114"},
+ {file = "duckdb-1.1.2.tar.gz", hash = "sha256:c8232861dc8ec6daa29067056d5a0e5789919f2ab22ab792787616d7cd52f02a"},
]
[[package]]
@@ -2761,6 +2797,26 @@ files = [
[package.extras]
test = ["pytest (>=6)"]
+[[package]]
+name = "fastapi"
+version = "0.115.4"
+description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "fastapi-0.115.4-py3-none-any.whl", hash = "sha256:0b504a063ffb3cf96a5e27dc1bc32c80ca743a2528574f9cdc77daa2d31b4742"},
+ {file = "fastapi-0.115.4.tar.gz", hash = "sha256:db653475586b091cb8b2fec2ac54a680ac6a158e07406e1abae31679e8826349"},
+]
+
+[package.dependencies]
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
+starlette = ">=0.40.0,<0.42.0"
+typing-extensions = ">=4.8.0"
+
+[package.extras]
+all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
+standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"]
+
[[package]]
name = "fastembed"
version = "0.2.6"
@@ -4086,6 +4142,23 @@ grpcio = ">=1.57.0"
protobuf = ">=4.21.6,<5.0dev"
setuptools = "*"
+[[package]]
+name = "grpclib"
+version = "0.4.7"
+description = "Pure-Python gRPC implementation for asyncio"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "grpclib-0.4.7.tar.gz", hash = "sha256:2988ef57c02b22b7a2e8e961792c41ccf97efc2ace91ae7a5b0de03c363823c3"},
+]
+
+[package.dependencies]
+h2 = ">=3.1.0,<5"
+multidict = "*"
+
+[package.extras]
+protobuf = ["protobuf (>=3.20.0)"]
+
[[package]]
name = "gunicorn"
version = "21.2.0"
@@ -4121,7 +4194,7 @@ files = [
name = "h2"
version = "4.1.0"
description = "HTTP/2 State-Machine based protocol implementation"
-optional = true
+optional = false
python-versions = ">=3.6.1"
files = [
{file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
@@ -4153,7 +4226,7 @@ test = ["eth-utils (>=1.0.1,<3)", "hypothesis (>=3.44.24,<=6.31.6)", "pytest (>=
name = "hpack"
version = "4.0.0"
description = "Pure-Python HPACK header compression"
-optional = true
+optional = false
python-versions = ">=3.6.1"
files = [
{file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
@@ -4283,7 +4356,7 @@ tests = ["freezegun", "pytest", "pytest-cov"]
name = "hyperframe"
version = "6.0.1"
description = "HTTP/2 framing layer for Python"
-optional = true
+optional = false
python-versions = ">=3.6.1"
files = [
{file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
@@ -5349,6 +5422,33 @@ files = [
[package.extras]
test = ["mypy (>=1.0)", "pytest (>=7.0.0)"]
+[[package]]
+name = "modal"
+version = "0.65.33"
+description = "Python client library for Modal"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "modal-0.65.33-py3-none-any.whl", hash = "sha256:083d49ef79ae8dcc8370d220bf9c569050c4241474c0970735d0fead3886bd4e"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+aiostream = ">=0.5.2,<0.6.0"
+certifi = "*"
+click = ">=8.1.0"
+fastapi = "*"
+grpclib = "0.4.7"
+protobuf = ">=3.19,<4.24.0 || >4.24.0,<6.0"
+rich = ">=12.0.0"
+synchronicity = ">=0.9.3,<0.10.0"
+toml = "*"
+typer = ">=0.9"
+types-certifi = "*"
+types-toml = "*"
+typing-extensions = ">=4.6,<5.0"
+watchfiles = "*"
+
[[package]]
name = "more-itertools"
version = "10.1.0"
@@ -6593,6 +6693,113 @@ six = "*"
[package.extras]
dev = ["nose", "pipreqs", "twine"]
+[[package]]
+name = "propcache"
+version = "0.2.0"
+description = "Accelerated property cache"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"},
+ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"},
+ {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"},
+ {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"},
+ {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"},
+ {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"},
+ {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"},
+ {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"},
+ {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"},
+ {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"},
+ {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"},
+ {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"},
+ {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"},
+ {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"},
+ {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"},
+ {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"},
+ {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"},
+ {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"},
+ {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"},
+ {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"},
+ {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"},
+ {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"},
+ {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"},
+ {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"},
+ {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"},
+ {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"},
+ {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"},
+ {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"},
+ {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"},
+ {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"},
+ {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"},
+ {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"},
+ {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"},
+ {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"},
+ {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"},
+ {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"},
+ {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"},
+ {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"},
+ {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"},
+ {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"},
+ {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"},
+ {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"},
+ {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"},
+ {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"},
+ {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"},
+ {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"},
+ {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"},
+ {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"},
+ {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"},
+ {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"},
+ {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"},
+ {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"},
+ {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"},
+ {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"},
+ {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"},
+ {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"},
+ {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"},
+ {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"},
+ {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"},
+ {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"},
+ {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"},
+ {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"},
+ {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"},
+ {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"},
+ {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"},
+ {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"},
+ {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"},
+ {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"},
+]
+
[[package]]
name = "proto-plus"
version = "1.22.3"
@@ -6754,52 +6961,55 @@ files = [
[[package]]
name = "pyarrow"
-version = "16.1.0"
+version = "17.0.0"
description = "Python library for Apache Arrow"
optional = false
python-versions = ">=3.8"
files = [
- {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"},
- {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"},
- {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"},
- {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"},
- {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"},
- {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"},
- {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"},
- {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"},
- {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"},
- {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"},
- {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"},
- {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"},
- {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"},
- {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"},
- {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"},
- {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"},
- {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"},
- {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"},
- {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"},
- {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"},
- {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"},
- {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"},
- {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"},
- {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"},
- {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"},
- {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"},
- {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"},
- {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"},
- {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"},
- {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"},
- {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"},
- {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"},
- {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"},
- {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"},
- {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"},
- {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"},
+ {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
+ {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
+ {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
+ {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
+ {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
+ {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
+ {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
+ {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
+ {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
+ {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
+ {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
+ {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
+ {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
+ {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
+ {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
+ {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
]
[package.dependencies]
numpy = ">=1.16.6"
+[package.extras]
+test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
+
[[package]]
name = "pyasn1"
version = "0.5.0"
@@ -8355,6 +8565,35 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+description = "Tool to Detect Surrounding Shell"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"},
+ {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
+]
+
+[[package]]
+name = "sigtools"
+version = "4.0.1"
+description = "Utilities for working with inspect.Signature objects."
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "sigtools-4.0.1-py2.py3-none-any.whl", hash = "sha256:d216b4cf920bbab0fce636ddc429ed8463a5b533d9e1492acb45a2a1bc36ac6c"},
+ {file = "sigtools-4.0.1.tar.gz", hash = "sha256:4b8e135a9cd4d2ea00da670c093372d74e672ba3abb87f4c98d8e73dea54445c"},
+]
+
+[package.dependencies]
+attrs = "*"
+
+[package.extras]
+test = ["coverage", "mock", "repeated-test (>=2.2.1)", "sphinx"]
+tests = ["coverage", "mock", "repeated-test (>=2.2.1)", "sphinx"]
+
[[package]]
name = "simplejson"
version = "3.19.1"
@@ -8741,6 +8980,24 @@ dev = ["build", "flake8"]
doc = ["sphinx"]
test = ["pytest", "pytest-cov"]
+[[package]]
+name = "starlette"
+version = "0.41.2"
+description = "The little ASGI library that shines."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "starlette-0.41.2-py3-none-any.whl", hash = "sha256:fbc189474b4731cf30fcef52f18a8d070e3f3b46c6a04c97579e85e6ffca942d"},
+ {file = "starlette-0.41.2.tar.gz", hash = "sha256:9834fd799d1a87fd346deb76158668cfa0b0d56f85caefe8268e2d97c3468b62"},
+]
+
+[package.dependencies]
+anyio = ">=3.4.0,<5"
+typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""}
+
+[package.extras]
+full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"]
+
[[package]]
name = "stevedore"
version = "5.1.0"
@@ -8769,6 +9026,21 @@ files = [
[package.dependencies]
mpmath = ">=0.19"
+[[package]]
+name = "synchronicity"
+version = "0.9.3"
+description = "Export blocking and async library versions from a single async implementation"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "synchronicity-0.9.3-py3-none-any.whl", hash = "sha256:73c06fe6613c698cbcfa6e77ab6b8d49cce3494c5afc3ef23b007b1fdff2256d"},
+ {file = "synchronicity-0.9.3.tar.gz", hash = "sha256:d3856601e63e518a143ec42f57988d9e88e4f94716168b717fd4b1b64f4704fd"},
+]
+
+[package.dependencies]
+sigtools = ">=4.0.1"
+typing-extensions = ">=4.12.2"
+
[[package]]
name = "tabulate"
version = "0.9.0"
@@ -9094,6 +9366,23 @@ files = [
[package.dependencies]
typing-extensions = ">=3.0.0"
+[[package]]
+name = "typer"
+version = "0.12.5"
+description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"},
+ {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"},
+]
+
+[package.dependencies]
+click = ">=8.0.0"
+rich = ">=10.11.0"
+shellingham = ">=1.3.0"
+typing-extensions = ">=3.7.4.3"
+
[[package]]
name = "types-awscrt"
version = "0.19.1"
@@ -9116,6 +9405,17 @@ files = [
{file = "types_cachetools-5.3.0.6-py3-none-any.whl", hash = "sha256:f7f8a25bfe306f2e6bc2ad0a2f949d9e72f2d91036d509c36d3810bf728bc6e1"},
]
+[[package]]
+name = "types-certifi"
+version = "2021.10.8.3"
+description = "Typing stubs for certifi"
+optional = false
+python-versions = "*"
+files = [
+ {file = "types-certifi-2021.10.8.3.tar.gz", hash = "sha256:72cf7798d165bc0b76e1c10dd1ea3097c7063c42c21d664523b928e88b554a4f"},
+ {file = "types_certifi-2021.10.8.3-py3-none-any.whl", hash = "sha256:b2d1e325e69f71f7c78e5943d410e650b4707bb0ef32e4ddf3da37f54176e88a"},
+]
+
[[package]]
name = "types-click"
version = "7.1.8"
@@ -9273,6 +9573,17 @@ files = [
{file = "types_SQLAlchemy-1.4.53.38-py3-none-any.whl", hash = "sha256:7e60e74f823931cc9a9e8adb0a4c05e5533e6708b8a266807893a739faf4eaaa"},
]
+[[package]]
+name = "types-toml"
+version = "0.10.8.20240310"
+description = "Typing stubs for toml"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "types-toml-0.10.8.20240310.tar.gz", hash = "sha256:3d41501302972436a6b8b239c850b26689657e25281b48ff0ec06345b8830331"},
+ {file = "types_toml-0.10.8.20240310-py3-none-any.whl", hash = "sha256:627b47775d25fa29977d9c70dc0cbab3f314f32c8d8d0c012f2ef5de7aaec05d"},
+]
+
[[package]]
name = "types-tqdm"
version = "4.66.0.2"
@@ -9454,6 +9765,101 @@ files = [
[package.extras]
watchmedo = ["PyYAML (>=3.10)"]
+[[package]]
+name = "watchfiles"
+version = "0.24.0"
+description = "Simple, modern and high performance file watching and code reload in python."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "watchfiles-0.24.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:083dc77dbdeef09fa44bb0f4d1df571d2e12d8a8f985dccde71ac3ac9ac067a0"},
+ {file = "watchfiles-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e94e98c7cb94cfa6e071d401ea3342767f28eb5a06a58fafdc0d2a4974f4f35c"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82ae557a8c037c42a6ef26c494d0631cacca040934b101d001100ed93d43f361"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:acbfa31e315a8f14fe33e3542cbcafc55703b8f5dcbb7c1eecd30f141df50db3"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b74fdffce9dfcf2dc296dec8743e5b0332d15df19ae464f0e249aa871fc1c571"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:449f43f49c8ddca87c6b3980c9284cab6bd1f5c9d9a2b00012adaaccd5e7decd"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4abf4ad269856618f82dee296ac66b0cd1d71450fc3c98532d93798e73399b7a"},
+ {file = "watchfiles-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f895d785eb6164678ff4bb5cc60c5996b3ee6df3edb28dcdeba86a13ea0465e"},
+ {file = "watchfiles-0.24.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7ae3e208b31be8ce7f4c2c0034f33406dd24fbce3467f77223d10cd86778471c"},
+ {file = "watchfiles-0.24.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2efec17819b0046dde35d13fb8ac7a3ad877af41ae4640f4109d9154ed30a188"},
+ {file = "watchfiles-0.24.0-cp310-none-win32.whl", hash = "sha256:6bdcfa3cd6fdbdd1a068a52820f46a815401cbc2cb187dd006cb076675e7b735"},
+ {file = "watchfiles-0.24.0-cp310-none-win_amd64.whl", hash = "sha256:54ca90a9ae6597ae6dc00e7ed0a040ef723f84ec517d3e7ce13e63e4bc82fa04"},
+ {file = "watchfiles-0.24.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:bdcd5538e27f188dd3c804b4a8d5f52a7fc7f87e7fd6b374b8e36a4ca03db428"},
+ {file = "watchfiles-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2dadf8a8014fde6addfd3c379e6ed1a981c8f0a48292d662e27cabfe4239c83c"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6509ed3f467b79d95fc62a98229f79b1a60d1b93f101e1c61d10c95a46a84f43"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8360f7314a070c30e4c976b183d1d8d1585a4a50c5cb603f431cebcbb4f66327"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:316449aefacf40147a9efaf3bd7c9bdd35aaba9ac5d708bd1eb5763c9a02bef5"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73bde715f940bea845a95247ea3e5eb17769ba1010efdc938ffcb967c634fa61"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3770e260b18e7f4e576edca4c0a639f704088602e0bc921c5c2e721e3acb8d15"},
+ {file = "watchfiles-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa0fd7248cf533c259e59dc593a60973a73e881162b1a2f73360547132742823"},
+ {file = "watchfiles-0.24.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d7a2e3b7f5703ffbd500dabdefcbc9eafeff4b9444bbdd5d83d79eedf8428fab"},
+ {file = "watchfiles-0.24.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d831ee0a50946d24a53821819b2327d5751b0c938b12c0653ea5be7dea9c82ec"},
+ {file = "watchfiles-0.24.0-cp311-none-win32.whl", hash = "sha256:49d617df841a63b4445790a254013aea2120357ccacbed00253f9c2b5dc24e2d"},
+ {file = "watchfiles-0.24.0-cp311-none-win_amd64.whl", hash = "sha256:d3dcb774e3568477275cc76554b5a565024b8ba3a0322f77c246bc7111c5bb9c"},
+ {file = "watchfiles-0.24.0-cp311-none-win_arm64.whl", hash = "sha256:9301c689051a4857d5b10777da23fafb8e8e921bcf3abe6448a058d27fb67633"},
+ {file = "watchfiles-0.24.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7211b463695d1e995ca3feb38b69227e46dbd03947172585ecb0588f19b0d87a"},
+ {file = "watchfiles-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b8693502d1967b00f2fb82fc1e744df128ba22f530e15b763c8d82baee15370"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdab9555053399318b953a1fe1f586e945bc8d635ce9d05e617fd9fe3a4687d6"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34e19e56d68b0dad5cff62273107cf5d9fbaf9d75c46277aa5d803b3ef8a9e9b"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:41face41f036fee09eba33a5b53a73e9a43d5cb2c53dad8e61fa6c9f91b5a51e"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5148c2f1ea043db13ce9b0c28456e18ecc8f14f41325aa624314095b6aa2e9ea"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e4bd963a935aaf40b625c2499f3f4f6bbd0c3776f6d3bc7c853d04824ff1c9f"},
+ {file = "watchfiles-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c79d7719d027b7a42817c5d96461a99b6a49979c143839fc37aa5748c322f234"},
+ {file = "watchfiles-0.24.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:32aa53a9a63b7f01ed32e316e354e81e9da0e6267435c7243bf8ae0f10b428ef"},
+ {file = "watchfiles-0.24.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ce72dba6a20e39a0c628258b5c308779b8697f7676c254a845715e2a1039b968"},
+ {file = "watchfiles-0.24.0-cp312-none-win32.whl", hash = "sha256:d9018153cf57fc302a2a34cb7564870b859ed9a732d16b41a9b5cb2ebed2d444"},
+ {file = "watchfiles-0.24.0-cp312-none-win_amd64.whl", hash = "sha256:551ec3ee2a3ac9cbcf48a4ec76e42c2ef938a7e905a35b42a1267fa4b1645896"},
+ {file = "watchfiles-0.24.0-cp312-none-win_arm64.whl", hash = "sha256:b52a65e4ea43c6d149c5f8ddb0bef8d4a1e779b77591a458a893eb416624a418"},
+ {file = "watchfiles-0.24.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:3d2e3ab79a1771c530233cadfd277fcc762656d50836c77abb2e5e72b88e3a48"},
+ {file = "watchfiles-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:327763da824817b38ad125dcd97595f942d720d32d879f6c4ddf843e3da3fe90"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd82010f8ab451dabe36054a1622870166a67cf3fce894f68895db6f74bbdc94"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d64ba08db72e5dfd5c33be1e1e687d5e4fcce09219e8aee893a4862034081d4e"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1cf1f6dd7825053f3d98f6d33f6464ebdd9ee95acd74ba2c34e183086900a827"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43e3e37c15a8b6fe00c1bce2473cfa8eb3484bbeecf3aefbf259227e487a03df"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88bcd4d0fe1d8ff43675360a72def210ebad3f3f72cabfeac08d825d2639b4ab"},
+ {file = "watchfiles-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:999928c6434372fde16c8f27143d3e97201160b48a614071261701615a2a156f"},
+ {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:30bbd525c3262fd9f4b1865cb8d88e21161366561cd7c9e1194819e0a33ea86b"},
+ {file = "watchfiles-0.24.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:edf71b01dec9f766fb285b73930f95f730bb0943500ba0566ae234b5c1618c18"},
+ {file = "watchfiles-0.24.0-cp313-none-win32.whl", hash = "sha256:f4c96283fca3ee09fb044f02156d9570d156698bc3734252175a38f0e8975f07"},
+ {file = "watchfiles-0.24.0-cp313-none-win_amd64.whl", hash = "sha256:a974231b4fdd1bb7f62064a0565a6b107d27d21d9acb50c484d2cdba515b9366"},
+ {file = "watchfiles-0.24.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:ee82c98bed9d97cd2f53bdb035e619309a098ea53ce525833e26b93f673bc318"},
+ {file = "watchfiles-0.24.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fd92bbaa2ecdb7864b7600dcdb6f2f1db6e0346ed425fbd01085be04c63f0b05"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f83df90191d67af5a831da3a33dd7628b02a95450e168785586ed51e6d28943c"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fca9433a45f18b7c779d2bae7beeec4f740d28b788b117a48368d95a3233ed83"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b995bfa6bf01a9e09b884077a6d37070464b529d8682d7691c2d3b540d357a0c"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed9aba6e01ff6f2e8285e5aa4154e2970068fe0fc0998c4380d0e6278222269b"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5171ef898299c657685306d8e1478a45e9303ddcd8ac5fed5bd52ad4ae0b69b"},
+ {file = "watchfiles-0.24.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4933a508d2f78099162da473841c652ad0de892719043d3f07cc83b33dfd9d91"},
+ {file = "watchfiles-0.24.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95cf3b95ea665ab03f5a54765fa41abf0529dbaf372c3b83d91ad2cfa695779b"},
+ {file = "watchfiles-0.24.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:01def80eb62bd5db99a798d5e1f5f940ca0a05986dcfae21d833af7a46f7ee22"},
+ {file = "watchfiles-0.24.0-cp38-none-win32.whl", hash = "sha256:4d28cea3c976499475f5b7a2fec6b3a36208656963c1a856d328aeae056fc5c1"},
+ {file = "watchfiles-0.24.0-cp38-none-win_amd64.whl", hash = "sha256:21ab23fdc1208086d99ad3f69c231ba265628014d4aed31d4e8746bd59e88cd1"},
+ {file = "watchfiles-0.24.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b665caeeda58625c3946ad7308fbd88a086ee51ccb706307e5b1fa91556ac886"},
+ {file = "watchfiles-0.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c51749f3e4e269231510da426ce4a44beb98db2dce9097225c338f815b05d4f"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82b2509f08761f29a0fdad35f7e1638b8ab1adfa2666d41b794090361fb8b855"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a60e2bf9dc6afe7f743e7c9b149d1fdd6dbf35153c78fe3a14ae1a9aee3d98b"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7d9b87c4c55e3ea8881dfcbf6d61ea6775fffed1fedffaa60bd047d3c08c430"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:78470906a6be5199524641f538bd2c56bb809cd4bf29a566a75051610bc982c3"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07cdef0c84c03375f4e24642ef8d8178e533596b229d32d2bbd69e5128ede02a"},
+ {file = "watchfiles-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d337193bbf3e45171c8025e291530fb7548a93c45253897cd764a6a71c937ed9"},
+ {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ec39698c45b11d9694a1b635a70946a5bad066b593af863460a8e600f0dff1ca"},
+ {file = "watchfiles-0.24.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2e28d91ef48eab0afb939fa446d8ebe77e2f7593f5f463fd2bb2b14132f95b6e"},
+ {file = "watchfiles-0.24.0-cp39-none-win32.whl", hash = "sha256:7138eff8baa883aeaa074359daabb8b6c1e73ffe69d5accdc907d62e50b1c0da"},
+ {file = "watchfiles-0.24.0-cp39-none-win_amd64.whl", hash = "sha256:b3ef2c69c655db63deb96b3c3e587084612f9b1fa983df5e0c3379d41307467f"},
+ {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:632676574429bee8c26be8af52af20e0c718cc7f5f67f3fb658c71928ccd4f7f"},
+ {file = "watchfiles-0.24.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a2a9891723a735d3e2540651184be6fd5b96880c08ffe1a98bae5017e65b544b"},
+ {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7fa2bc0efef3e209a8199fd111b8969fe9db9c711acc46636686331eda7dd4"},
+ {file = "watchfiles-0.24.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01550ccf1d0aed6ea375ef259706af76ad009ef5b0203a3a4cce0f6024f9b68a"},
+ {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:96619302d4374de5e2345b2b622dc481257a99431277662c30f606f3e22f42be"},
+ {file = "watchfiles-0.24.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:85d5f0c7771dcc7a26c7a27145059b6bb0ce06e4e751ed76cdf123d7039b60b5"},
+ {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:951088d12d339690a92cef2ec5d3cfd957692834c72ffd570ea76a6790222777"},
+ {file = "watchfiles-0.24.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49fb58bcaa343fedc6a9e91f90195b20ccb3135447dc9e4e2570c3a39565853e"},
+ {file = "watchfiles-0.24.0.tar.gz", hash = "sha256:afb72325b74fa7a428c009c1b8be4b4d7c2afedafb2982827ef2156646df2fe1"},
+]
+
+[package.dependencies]
+anyio = ">=3.0.0"
+
[[package]]
name = "wcwidth"
version = "0.2.6"
@@ -9669,90 +10075,115 @@ tomli = ">=2.0.1"
[[package]]
name = "yarl"
-version = "1.9.2"
+version = "1.15.2"
description = "Yet another URL library"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
files = [
- {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"},
- {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"},
- {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"},
- {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560"},
- {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac"},
- {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea"},
- {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608"},
- {file = "yarl-1.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5"},
- {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0"},
- {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4"},
- {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095"},
- {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3"},
- {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528"},
- {file = "yarl-1.9.2-cp310-cp310-win32.whl", hash = "sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3"},
- {file = "yarl-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde"},
- {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6"},
- {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb"},
- {file = "yarl-1.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0"},
- {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2"},
- {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191"},
- {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d"},
- {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7"},
- {file = "yarl-1.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6"},
- {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8"},
- {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9"},
- {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be"},
- {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7"},
- {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a"},
- {file = "yarl-1.9.2-cp311-cp311-win32.whl", hash = "sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8"},
- {file = "yarl-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051"},
- {file = "yarl-1.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74"},
- {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367"},
- {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef"},
- {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3"},
- {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938"},
- {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc"},
- {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33"},
- {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45"},
- {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185"},
- {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04"},
- {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582"},
- {file = "yarl-1.9.2-cp37-cp37m-win32.whl", hash = "sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b"},
- {file = "yarl-1.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368"},
- {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac"},
- {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4"},
- {file = "yarl-1.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574"},
- {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb"},
- {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59"},
- {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e"},
- {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417"},
- {file = "yarl-1.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78"},
- {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333"},
- {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c"},
- {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5"},
- {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc"},
- {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b"},
- {file = "yarl-1.9.2-cp38-cp38-win32.whl", hash = "sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7"},
- {file = "yarl-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72"},
- {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9"},
- {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8"},
- {file = "yarl-1.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7"},
- {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716"},
- {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a"},
- {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3"},
- {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955"},
- {file = "yarl-1.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1"},
- {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4"},
- {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6"},
- {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf"},
- {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3"},
- {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80"},
- {file = "yarl-1.9.2-cp39-cp39-win32.whl", hash = "sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623"},
- {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"},
- {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"},
+ {file = "yarl-1.15.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e4ee8b8639070ff246ad3649294336b06db37a94bdea0d09ea491603e0be73b8"},
+ {file = "yarl-1.15.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a7cf963a357c5f00cb55b1955df8bbe68d2f2f65de065160a1c26b85a1e44172"},
+ {file = "yarl-1.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:43ebdcc120e2ca679dba01a779333a8ea76b50547b55e812b8b92818d604662c"},
+ {file = "yarl-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3433da95b51a75692dcf6cc8117a31410447c75a9a8187888f02ad45c0a86c50"},
+ {file = "yarl-1.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38d0124fa992dbacd0c48b1b755d3ee0a9f924f427f95b0ef376556a24debf01"},
+ {file = "yarl-1.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ded1b1803151dd0f20a8945508786d57c2f97a50289b16f2629f85433e546d47"},
+ {file = "yarl-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace4cad790f3bf872c082366c9edd7f8f8f77afe3992b134cfc810332206884f"},
+ {file = "yarl-1.15.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c77494a2f2282d9bbbbcab7c227a4d1b4bb829875c96251f66fb5f3bae4fb053"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b7f227ca6db5a9fda0a2b935a2ea34a7267589ffc63c8045f0e4edb8d8dcf956"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:31561a5b4d8dbef1559b3600b045607cf804bae040f64b5f5bca77da38084a8a"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3e52474256a7db9dcf3c5f4ca0b300fdea6c21cca0148c8891d03a025649d935"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0e1af74a9529a1137c67c887ed9cde62cff53aa4d84a3adbec329f9ec47a3936"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:15c87339490100c63472a76d87fe7097a0835c705eb5ae79fd96e343473629ed"},
+ {file = "yarl-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:74abb8709ea54cc483c4fb57fb17bb66f8e0f04438cff6ded322074dbd17c7ec"},
+ {file = "yarl-1.15.2-cp310-cp310-win32.whl", hash = "sha256:ffd591e22b22f9cb48e472529db6a47203c41c2c5911ff0a52e85723196c0d75"},
+ {file = "yarl-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:1695497bb2a02a6de60064c9f077a4ae9c25c73624e0d43e3aa9d16d983073c2"},
+ {file = "yarl-1.15.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9fcda20b2de7042cc35cf911702fa3d8311bd40055a14446c1e62403684afdc5"},
+ {file = "yarl-1.15.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0545de8c688fbbf3088f9e8b801157923be4bf8e7b03e97c2ecd4dfa39e48e0e"},
+ {file = "yarl-1.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fbda058a9a68bec347962595f50546a8a4a34fd7b0654a7b9697917dc2bf810d"},
+ {file = "yarl-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1ac2bc069f4a458634c26b101c2341b18da85cb96afe0015990507efec2e417"},
+ {file = "yarl-1.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd126498171f752dd85737ab1544329a4520c53eed3997f9b08aefbafb1cc53b"},
+ {file = "yarl-1.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3db817b4e95eb05c362e3b45dafe7144b18603e1211f4a5b36eb9522ecc62bcf"},
+ {file = "yarl-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:076b1ed2ac819933895b1a000904f62d615fe4533a5cf3e052ff9a1da560575c"},
+ {file = "yarl-1.15.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f8cfd847e6b9ecf9f2f2531c8427035f291ec286c0a4944b0a9fce58c6446046"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:32b66be100ac5739065496c74c4b7f3015cef792c3174982809274d7e51b3e04"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:34a2d76a1984cac04ff8b1bfc939ec9dc0914821264d4a9c8fd0ed6aa8d4cfd2"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0afad2cd484908f472c8fe2e8ef499facee54a0a6978be0e0cff67b1254fd747"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c68e820879ff39992c7f148113b46efcd6ec765a4865581f2902b3c43a5f4bbb"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:98f68df80ec6ca3015186b2677c208c096d646ef37bbf8b49764ab4a38183931"},
+ {file = "yarl-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3c56ec1eacd0a5d35b8a29f468659c47f4fe61b2cab948ca756c39b7617f0aa5"},
+ {file = "yarl-1.15.2-cp311-cp311-win32.whl", hash = "sha256:eedc3f247ee7b3808ea07205f3e7d7879bc19ad3e6222195cd5fbf9988853e4d"},
+ {file = "yarl-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:0ccaa1bc98751fbfcf53dc8dfdb90d96e98838010fc254180dd6707a6e8bb179"},
+ {file = "yarl-1.15.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:82d5161e8cb8f36ec778fd7ac4d740415d84030f5b9ef8fe4da54784a1f46c94"},
+ {file = "yarl-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fa2bea05ff0a8fb4d8124498e00e02398f06d23cdadd0fe027d84a3f7afde31e"},
+ {file = "yarl-1.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:99e12d2bf587b44deb74e0d6170fec37adb489964dbca656ec41a7cd8f2ff178"},
+ {file = "yarl-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:243fbbbf003754fe41b5bdf10ce1e7f80bcc70732b5b54222c124d6b4c2ab31c"},
+ {file = "yarl-1.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:856b7f1a7b98a8c31823285786bd566cf06226ac4f38b3ef462f593c608a9bd6"},
+ {file = "yarl-1.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:553dad9af802a9ad1a6525e7528152a015b85fb8dbf764ebfc755c695f488367"},
+ {file = "yarl-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30c3ff305f6e06650a761c4393666f77384f1cc6c5c0251965d6bfa5fbc88f7f"},
+ {file = "yarl-1.15.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:353665775be69bbfc6d54c8d134bfc533e332149faeddd631b0bc79df0897f46"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f4fe99ce44128c71233d0d72152db31ca119711dfc5f2c82385ad611d8d7f897"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:9c1e3ff4b89cdd2e1a24c214f141e848b9e0451f08d7d4963cb4108d4d798f1f"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:711bdfae4e699a6d4f371137cbe9e740dc958530cb920eb6f43ff9551e17cfbc"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4388c72174868884f76affcdd3656544c426407e0043c89b684d22fb265e04a5"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f0e1844ad47c7bd5d6fa784f1d4accc5f4168b48999303a868fe0f8597bde715"},
+ {file = "yarl-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a5cafb02cf097a82d74403f7e0b6b9df3ffbfe8edf9415ea816314711764a27b"},
+ {file = "yarl-1.15.2-cp312-cp312-win32.whl", hash = "sha256:156ececdf636143f508770bf8a3a0498de64da5abd890c7dbb42ca9e3b6c05b8"},
+ {file = "yarl-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:435aca062444a7f0c884861d2e3ea79883bd1cd19d0a381928b69ae1b85bc51d"},
+ {file = "yarl-1.15.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:416f2e3beaeae81e2f7a45dc711258be5bdc79c940a9a270b266c0bec038fb84"},
+ {file = "yarl-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:173563f3696124372831007e3d4b9821746964a95968628f7075d9231ac6bb33"},
+ {file = "yarl-1.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9ce2e0f6123a60bd1a7f5ae3b2c49b240c12c132847f17aa990b841a417598a2"},
+ {file = "yarl-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaea112aed589131f73d50d570a6864728bd7c0c66ef6c9154ed7b59f24da611"},
+ {file = "yarl-1.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4ca3b9f370f218cc2a0309542cab8d0acdfd66667e7c37d04d617012485f904"},
+ {file = "yarl-1.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23ec1d3c31882b2a8a69c801ef58ebf7bae2553211ebbddf04235be275a38548"},
+ {file = "yarl-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75119badf45f7183e10e348edff5a76a94dc19ba9287d94001ff05e81475967b"},
+ {file = "yarl-1.15.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78e6fdc976ec966b99e4daa3812fac0274cc28cd2b24b0d92462e2e5ef90d368"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8657d3f37f781d987037f9cc20bbc8b40425fa14380c87da0cb8dfce7c92d0fb"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:93bed8a8084544c6efe8856c362af08a23e959340c87a95687fdbe9c9f280c8b"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:69d5856d526802cbda768d3e6246cd0d77450fa2a4bc2ea0ea14f0d972c2894b"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:ccad2800dfdff34392448c4bf834be124f10a5bc102f254521d931c1c53c455a"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a880372e2e5dbb9258a4e8ff43f13888039abb9dd6d515f28611c54361bc5644"},
+ {file = "yarl-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c998d0558805860503bc3a595994895ca0f7835e00668dadc673bbf7f5fbfcbe"},
+ {file = "yarl-1.15.2-cp313-cp313-win32.whl", hash = "sha256:533a28754e7f7439f217550a497bb026c54072dbe16402b183fdbca2431935a9"},
+ {file = "yarl-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:5838f2b79dc8f96fdc44077c9e4e2e33d7089b10788464609df788eb97d03aad"},
+ {file = "yarl-1.15.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fbbb63bed5fcd70cd3dd23a087cd78e4675fb5a2963b8af53f945cbbca79ae16"},
+ {file = "yarl-1.15.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e2e93b88ecc8f74074012e18d679fb2e9c746f2a56f79cd5e2b1afcf2a8a786b"},
+ {file = "yarl-1.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:af8ff8d7dc07ce873f643de6dfbcd45dc3db2c87462e5c387267197f59e6d776"},
+ {file = "yarl-1.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66f629632220a4e7858b58e4857927dd01a850a4cef2fb4044c8662787165cf7"},
+ {file = "yarl-1.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:833547179c31f9bec39b49601d282d6f0ea1633620701288934c5f66d88c3e50"},
+ {file = "yarl-1.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa738e0282be54eede1e3f36b81f1e46aee7ec7602aa563e81e0e8d7b67963f"},
+ {file = "yarl-1.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a13a07532e8e1c4a5a3afff0ca4553da23409fad65def1b71186fb867eeae8d"},
+ {file = "yarl-1.15.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c45817e3e6972109d1a2c65091504a537e257bc3c885b4e78a95baa96df6a3f8"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:670eb11325ed3a6209339974b276811867defe52f4188fe18dc49855774fa9cf"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:d417a4f6943112fae3924bae2af7112562285848d9bcee737fc4ff7cbd450e6c"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bc8936d06cd53fddd4892677d65e98af514c8d78c79864f418bbf78a4a2edde4"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:954dde77c404084c2544e572f342aef384240b3e434e06cecc71597e95fd1ce7"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:5bc0df728e4def5e15a754521e8882ba5a5121bd6b5a3a0ff7efda5d6558ab3d"},
+ {file = "yarl-1.15.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:b71862a652f50babab4a43a487f157d26b464b1dedbcc0afda02fd64f3809d04"},
+ {file = "yarl-1.15.2-cp38-cp38-win32.whl", hash = "sha256:63eab904f8630aed5a68f2d0aeab565dcfc595dc1bf0b91b71d9ddd43dea3aea"},
+ {file = "yarl-1.15.2-cp38-cp38-win_amd64.whl", hash = "sha256:2cf441c4b6e538ba0d2591574f95d3fdd33f1efafa864faa077d9636ecc0c4e9"},
+ {file = "yarl-1.15.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a32d58f4b521bb98b2c0aa9da407f8bd57ca81f34362bcb090e4a79e9924fefc"},
+ {file = "yarl-1.15.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:766dcc00b943c089349d4060b935c76281f6be225e39994c2ccec3a2a36ad627"},
+ {file = "yarl-1.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bed1b5dbf90bad3bfc19439258c97873eab453c71d8b6869c136346acfe497e7"},
+ {file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed20a4bdc635f36cb19e630bfc644181dd075839b6fc84cac51c0f381ac472e2"},
+ {file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d538df442c0d9665664ab6dd5fccd0110fa3b364914f9c85b3ef9b7b2e157980"},
+ {file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c6cf1d92edf936ceedc7afa61b07e9d78a27b15244aa46bbcd534c7458ee1b"},
+ {file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce44217ad99ffad8027d2fde0269ae368c86db66ea0571c62a000798d69401fb"},
+ {file = "yarl-1.15.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47a6000a7e833ebfe5886b56a31cb2ff12120b1efd4578a6fcc38df16cc77bd"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e52f77a0cd246086afde8815039f3e16f8d2be51786c0a39b57104c563c5cbb0"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:f9ca0e6ce7774dc7830dc0cc4bb6b3eec769db667f230e7c770a628c1aa5681b"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:136f9db0f53c0206db38b8cd0c985c78ded5fd596c9a86ce5c0b92afb91c3a19"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:173866d9f7409c0fb514cf6e78952e65816600cb888c68b37b41147349fe0057"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:6e840553c9c494a35e449a987ca2c4f8372668ee954a03a9a9685075228e5036"},
+ {file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:458c0c65802d816a6b955cf3603186de79e8fdb46d4f19abaec4ef0a906f50a7"},
+ {file = "yarl-1.15.2-cp39-cp39-win32.whl", hash = "sha256:5b48388ded01f6f2429a8c55012bdbd1c2a0c3735b3e73e221649e524c34a58d"},
+ {file = "yarl-1.15.2-cp39-cp39-win_amd64.whl", hash = "sha256:81dadafb3aa124f86dc267a2168f71bbd2bfb163663661ab0038f6e4b8edb810"},
+ {file = "yarl-1.15.2-py3-none-any.whl", hash = "sha256:0d3105efab7c5c091609abacad33afff33bdff0035bece164c98bcf5a85ef90a"},
+ {file = "yarl-1.15.2.tar.gz", hash = "sha256:a39c36f4218a5bb668b4f06874d676d35a035ee668e6e7e3538835c703634b84"},
]
[package.dependencies]
idna = ">=2.0"
multidict = ">=4.0"
+propcache = ">=0.2.0"
[[package]]
name = "zipp"
@@ -9861,4 +10292,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.13"
-content-hash = "97666ad4613f07d95c5388bae41befe6cc10c88d02ee8f1cee27b161e13729f1"
+content-hash = "f932b4e28b8d08489ca58a67e6844006a21dc73d453c3eae8469cd57760bb891"
diff --git a/pyproject.toml b/pyproject.toml
index cdf7c33cb3..6eb162ac55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dlt"
-version = "1.3.0"
+version = "1.4.0"
description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run."
authors = ["dltHub Inc. "]
maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ]
@@ -83,12 +83,13 @@ clickhouse-driver = { version = ">=0.2.7", optional = true }
clickhouse-connect = { version = ">=0.7.7", optional = true }
lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'", allow-prereleases = true }
tantivy = { version = ">= 0.22.0", optional = true }
-deltalake = { version = ">=0.19.0", optional = true }
+deltalake = { version = ">=0.21.0", optional = true }
sqlalchemy = { version = ">=1.4", optional = true }
alembic = {version = ">1.10.0", optional = true}
paramiko = {version = ">=3.3.0", optional = true}
sqlglot = {version = ">=20.0.0", optional = true}
db-dtypes = { version = ">=1.2.0", optional = true }
+aiohttp = { version = ">=3.9", optional = true }
[tool.poetry.extras]
gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"]
@@ -239,11 +240,12 @@ dbt-duckdb = ">=1.2.0"
pymongo = ">=4.3.3"
pandas = ">2"
alive-progress = ">=3.0.1"
-pyarrow = ">=14.0.0"
+pyarrow = ">=17.0.0"
psycopg2-binary = ">=2.9"
lancedb = { version = ">=0.8.2", markers = "python_version >= '3.9'", allow-prereleases = true }
openai = ">=1.45"
connectorx = { version = ">=0.3.2" }
+modal = ">=0.64.170"
[tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file
line-length = 100
diff --git a/tests/cli/common/test_telemetry_command.py b/tests/cli/common/test_telemetry_command.py
index 4daa5f63ef..f165ff63b1 100644
--- a/tests/cli/common/test_telemetry_command.py
+++ b/tests/cli/common/test_telemetry_command.py
@@ -49,6 +49,10 @@ def _initial_providers(self):
# make sure no config.toml exists in project (it is not created if it was not already there)
project_dot = os.path.join("project", DOT_DLT)
assert not test_storage.has_folder(project_dot)
+ # load global config
+ global_toml = ConfigTomlProvider(run_context.global_dir)
+ assert global_toml._config_doc["runtime"]["dlthub_telemetry"] is False
+
# enable telemetry
with io.StringIO() as buf, contextlib.redirect_stdout(buf):
change_telemetry_status_command(True)
@@ -56,6 +60,10 @@ def _initial_providers(self):
output = buf.getvalue()
assert "ON" in output
assert "ENABLED" in output
+ # load global config
+ global_toml = ConfigTomlProvider(run_context.global_dir)
+ assert global_toml._config_doc["runtime"]["dlthub_telemetry"] is True
+
# create config toml in project dir
test_storage.create_folder(project_dot)
test_storage.save(os.path.join("project", DOT_DLT, CONFIG_TOML), "# empty")
@@ -67,10 +75,14 @@ def _initial_providers(self):
output = buf.getvalue()
assert "OFF" in output
assert "DISABLED" in output
- # load local config provider
- project_toml = ConfigTomlProvider(run_context.settings_dir)
- # local project toml was modified
- assert project_toml._config_doc["runtime"]["dlthub_telemetry"] is False
+
+ # load global config provider
+ global_toml = ConfigTomlProvider(run_context.global_dir)
+ assert global_toml._config_doc["runtime"]["dlthub_telemetry"] is False
+ # load local config provider
+ project_toml = ConfigTomlProvider(run_context.settings_dir)
+ # local project toml was modified
+ assert project_toml._config_doc["runtime"]["dlthub_telemetry"] is False
def test_command_instrumentation() -> None:
diff --git a/tests/cli/test_config_toml_writer.py b/tests/cli/test_config_toml_writer.py
index 8ccac21f99..31c6f524a7 100644
--- a/tests/cli/test_config_toml_writer.py
+++ b/tests/cli/test_config_toml_writer.py
@@ -1,8 +1,10 @@
-from typing import Optional, Final
+from typing import ClassVar, List, Optional, Final
import pytest
import tomlkit
from dlt.cli.config_toml_writer import write_value, WritableConfigValue, write_values
+from dlt.common.configuration.specs import configspec
+from dlt.common.destination.reference import DEFAULT_FILE_LAYOUT
EXAMPLE_COMMENT = "# please set me up!"
@@ -159,3 +161,95 @@ def test_write_values_without_defaults(example_toml):
assert example_toml["genomic_info"]["gene_data"]["genes"] == {"key": "value"}
assert example_toml["genomic_info"]["gene_data"]["genes"].trivia.comment == EXAMPLE_COMMENT
+
+
+def test_write_spec_without_defaults(example_toml) -> None:
+ from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration
+ from dlt.destinations.impl.filesystem.configuration import (
+ FilesystemDestinationClientConfiguration,
+ )
+
+ write_value(
+ example_toml, "snowflake", SnowflakeClientConfiguration, False, is_default_of_interest=True
+ )
+ # nothing of interest directly in the "snowflake" section
+ # host, database, username are required and will be included
+ # "password", "warehouse", "role" are explicitly marked as of interest
+ assert example_toml.as_string() == """[snowflake.credentials]
+database = "database" # please set me up!
+password = "password" # please set me up!
+username = "username" # please set me up!
+host = "host" # please set me up!
+warehouse = "warehouse" # please set me up!
+role = "role" # please set me up!
+"""
+ example_toml = tomlkit.parse("")
+ write_value(
+ example_toml,
+ "filesystem",
+ FilesystemDestinationClientConfiguration,
+ False,
+ is_default_of_interest=True,
+ )
+
+ # bucket_url is mandatory, same for aws credentials
+ assert example_toml.as_string() == """[filesystem]
+bucket_url = "bucket_url" # please set me up!
+
+[filesystem.credentials]
+aws_access_key_id = "aws_access_key_id" # please set me up!
+aws_secret_access_key = "aws_secret_access_key" # please set me up!
+"""
+
+ @configspec
+ class SnowflakeDatabaseConfiguration(SnowflakeClientConfiguration):
+ database: str = "dlt_db"
+
+ __config_gen_annotations__: ClassVar[List[str]] = ["database"]
+
+ example_toml = tomlkit.parse("")
+ write_value(
+ example_toml,
+ "snowflake",
+ SnowflakeDatabaseConfiguration,
+ False,
+ is_default_of_interest=True,
+ )
+
+ # uses the default value from the spec
+ assert example_toml["snowflake"]["database"] == "dlt_db"
+
+ # use initial values
+ example_toml = tomlkit.parse("")
+ write_value(
+ example_toml,
+ "filesystem",
+ FilesystemDestinationClientConfiguration,
+ False,
+ is_default_of_interest=True,
+ default_value={
+ "bucket_url": "az://test-az-bucket",
+ "layout": DEFAULT_FILE_LAYOUT,
+ "credentials": {"region_name": "eu"},
+ },
+ )
+ assert example_toml["filesystem"]["bucket_url"] == "az://test-az-bucket"
+ # TODO: choose right credentials based on bucket_url
+ assert example_toml["filesystem"]["credentials"]["aws_access_key_id"] == "aws_access_key_id"
+ # if the initial value differs from the default, it is included
+ assert example_toml["filesystem"]["credentials"]["region_name"] == "eu"
+ # this is the same as the default, so it is not included
+ assert "layout" not in example_toml["filesystem"]
+
+ example_toml = tomlkit.parse("")
+ write_value(
+ example_toml,
+ "snowflake",
+ SnowflakeDatabaseConfiguration,
+ False,
+ is_default_of_interest=True,
+ default_value={"database": "dlt_db"},
+ )
+
+ # still written because "database" is explicitly marked as of interest
+ assert example_toml["snowflake"]["database"] == "dlt_db"
diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py
index d4ee1844d7..8e1affd164 100644
--- a/tests/cli/test_init_command.py
+++ b/tests/cli/test_init_command.py
@@ -4,7 +4,7 @@
import os
import contextlib
from subprocess import CalledProcessError
-from typing import Any, List, Tuple, Optional
+from typing import List, Tuple, Optional
from hexbytes import HexBytes
import pytest
from unittest import mock
@@ -55,7 +55,12 @@
# we hardcode the core sources here so we can check that the init script picks
# up the right source
-CORE_SOURCES = ["filesystem", "rest_api", "sql_database"]
+CORE_SOURCES_CONFIG = {
+ "rest_api": {"requires_extra": False},
+ "sql_database": {"requires_extra": True},
+ "filesystem": {"requires_extra": True},
+}
+CORE_SOURCES = list(CORE_SOURCES_CONFIG.keys())
# we also hardcode all the templates here for testing
TEMPLATES = ["debug", "default", "arrow", "requests", "dataframe", "fruitshop", "github_api"]
@@ -167,6 +172,37 @@ def test_init_list_sources(repo_dir: str) -> None:
assert source in _out
+@pytest.mark.parametrize(
+ "source_name",
+ [name for name in CORE_SOURCES_CONFIG if CORE_SOURCES_CONFIG[name]["requires_extra"]],
+)
+def test_init_command_core_source_requirements_with_extras(
+ source_name: str, repo_dir: str, project_files: FileStorage
+) -> None:
+ init_command.init_command(source_name, "duckdb", repo_dir)
+ source_requirements = SourceRequirements.from_string(
+ project_files.load(cli_utils.REQUIREMENTS_TXT)
+ )
+ canonical_name = source_name.replace("_", "-")
+ assert canonical_name in source_requirements.dlt_requirement.extras
+
+
+@pytest.mark.parametrize(
+ "source_name",
+ [name for name in CORE_SOURCES_CONFIG if not CORE_SOURCES_CONFIG[name]["requires_extra"]],
+)
+def test_init_command_core_source_requirements_without_extras(
+ source_name: str, repo_dir: str, project_files: FileStorage
+) -> None:
+ init_command.init_command(source_name, "duckdb", repo_dir)
+ source_requirements = SourceRequirements.from_string(
+ project_files.load(cli_utils.REQUIREMENTS_TXT)
+ )
+ assert source_requirements.dlt_requirement.extras == {
+ "duckdb"
+ }, "Only duckdb should be in extras"
+
+
def test_init_list_sources_update_warning(repo_dir: str, project_files: FileStorage) -> None:
"""Sources listed include a warning if a different dlt version is required"""
with mock.patch.object(SourceRequirements, "current_dlt_version", return_value="0.0.1"):
@@ -571,7 +607,7 @@ def assert_requirements_txt(project_files: FileStorage, destination_name: str) -
project_files.load(cli_utils.REQUIREMENTS_TXT)
)
assert destination_name in source_requirements.dlt_requirement.extras
- # Check that atleast some version range is specified
+ # Check that at least some version range is specified
assert len(source_requirements.dlt_requirement.specifier) >= 1
diff --git a/tests/common/cases/modules/google/colab/__init__.py b/tests/common/cases/modules/google/colab/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/common/cases/modules/google/colab/userdata.py b/tests/common/cases/modules/google/colab/userdata.py
new file mode 100644
index 0000000000..f341de32d1
--- /dev/null
+++ b/tests/common/cases/modules/google/colab/userdata.py
@@ -0,0 +1,16 @@
+"""Mocked colab userdata"""
+
+
+class SecretNotFoundError(Exception):
+ pass
+
+
+class NotebookAccessError(Exception):
+ pass
+
+
+def get(secret_name: str) -> str:
+ if secret_name == "secrets.toml":
+ return 'api_key="api"'
+
+ raise SecretNotFoundError()
diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py
index 8d55e02a87..00a28d652e 100644
--- a/tests/common/configuration/test_configuration.py
+++ b/tests/common/configuration/test_configuration.py
@@ -374,7 +374,7 @@ def test_default_values(environment: Any) -> None:
assert c.none_val == type(environment)
-def test_raises_on_final_value_change(environment: Any) -> None:
+def test_final_ignores_value_change(environment: Any) -> None:
@configspec
class FinalConfiguration(BaseConfiguration):
pipeline_name: Final[str] = "comp"
@@ -387,6 +387,12 @@ class FinalConfiguration(BaseConfiguration):
# config providers are ignored for final fields
assert c.pipeline_name == "comp"
+ # explicit values always work
+ c = resolve.resolve_configuration(FinalConfiguration(), explicit_value={"pipeline_name": "exp"})
+ assert c.pipeline_name == "exp"
+ with pytest.raises(ConfigFieldMissingException):
+ resolve.resolve_configuration(FinalConfiguration(), explicit_value={"pipeline_name": None})
+
@configspec
class FinalConfiguration2(BaseConfiguration):
pipeline_name: Final[str] = None
@@ -394,6 +400,53 @@ class FinalConfiguration2(BaseConfiguration):
c2 = resolve.resolve_configuration(FinalConfiguration2())
assert dict(c2) == {"pipeline_name": None}
+ c2 = resolve.resolve_configuration(
+ FinalConfiguration2(), explicit_value={"pipeline_name": "exp"}
+ )
+ assert c2.pipeline_name == "exp"
+ with pytest.raises(ConfigFieldMissingException):
+ resolve.resolve_configuration(FinalConfiguration2(), explicit_value={"pipeline_name": None})
+
+
+def test_not_resolved_ignores_value_change(environment: Any) -> None:
+ @configspec
+ class NotResolvedConfiguration(BaseConfiguration):
+ pipeline_name: Annotated[str, NotResolved()] = "comp"
+
+ c = resolve.resolve_configuration(NotResolvedConfiguration())
+ assert dict(c) == {"pipeline_name": "comp"}
+
+ environment["PIPELINE_NAME"] = "env name"
+ c = resolve.resolve_configuration(NotResolvedConfiguration())
+ # config providers are ignored for fields annotated with NotResolved
+ assert c.pipeline_name == "comp"
+
+ # explicit values always work
+ c = resolve.resolve_configuration(
+ NotResolvedConfiguration(), explicit_value={"pipeline_name": "exp"}
+ )
+ assert c.pipeline_name == "exp"
+ with pytest.raises(ConfigFieldMissingException):
+ resolve.resolve_configuration(
+ NotResolvedConfiguration(), explicit_value={"pipeline_name": None}
+ )
+
+ @configspec
+ class NotResolvedConfiguration2(BaseConfiguration):
+ pipeline_name: Annotated[str, NotResolved()] = None
+
+ c2 = resolve.resolve_configuration(NotResolvedConfiguration2())
+ assert dict(c2) == {"pipeline_name": None}
+
+ c2 = resolve.resolve_configuration(
+ NotResolvedConfiguration2(), explicit_value={"pipeline_name": "exp"}
+ )
+ assert c2.pipeline_name == "exp"
+ with pytest.raises(ConfigFieldMissingException):
+ resolve.resolve_configuration(
+ NotResolvedConfiguration2(), explicit_value={"pipeline_name": None}
+ )
+
def test_explicit_native_always_skips_resolve(environment: Any) -> None:
# make the instance sectioned so it can read from INSTRUMENTED
@@ -1055,6 +1108,13 @@ class NotResolveConfiguration(BaseConfiguration):
assert c2.trace is s1
assert c2.traces[0] is s2
+ # explicit values also write to fields marked as not resolvable
+ c2 = resolve.resolve_configuration(
+ NotResolveConfiguration(), explicit_value={"trace": s1, "traces": [s2]}
+ )
+ assert c2.trace is s1
+ assert c2.traces[0] is s2
+
def test_configspec_auto_base_config_derivation() -> None:
@configspec
diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py
index 9e192a984d..481c21b7bb 100644
--- a/tests/common/configuration/test_toml_provider.py
+++ b/tests/common/configuration/test_toml_provider.py
@@ -1,4 +1,5 @@
import os
+import sys
import pytest
import yaml
from typing import Any, Dict, Type
@@ -32,7 +33,7 @@
from dlt.common.runners.configuration import PoolRunnerConfiguration
from dlt.common.typing import TSecretValue
-from tests.utils import preserve_environ
+from tests.utils import preserve_environ, unload_modules
from tests.common.configuration.utils import (
ConnectionStringCompatCredentials,
SecretCredentials,
@@ -534,3 +535,20 @@ def loader() -> Dict[str, Any]:
),
)
assert config.username == "dlt-loader"
+
+
+def test_colab_toml() -> None:
+ # use a path without any settings files
+ try:
+ sys.path.append("tests/common/cases/modules")
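+ # the mocked google.colab.userdata module (added under tests/common/cases/modules) becomes importable, so the provider can fall back to Colab user data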
+ # secrets are in user data
+ provider: SettingsTomlProvider = SecretsTomlProvider("tests/common/null", global_dir=None)
+ assert provider.to_toml() == 'api_key="api"'
+ # config is not in userdata
+ provider = ConfigTomlProvider("tests/common/null", "unknown")
+ assert provider.is_empty
+ # prefers files
+ provider = SecretsTomlProvider("tests/common/cases/configuration/.dlt", global_dir=None)
+ assert provider.get_value("secret_value", str, None) == ("2137", "secret_value")
+ finally:
+ sys.path.pop()
diff --git a/tests/common/destination/test_reference.py b/tests/common/destination/test_reference.py
index f04820bf36..93eef793d5 100644
--- a/tests/common/destination/test_reference.py
+++ b/tests/common/destination/test_reference.py
@@ -390,12 +390,12 @@ def test_normalize_staging_dataset_name() -> None:
.normalize_staging_dataset_name(Schema("private"))
== "static_staging"
)
- # empty dataset -> empty staging
+ # empty dataset -> placeholder still applied
assert (
DestinationClientDwhConfiguration()
._bind_dataset_name(dataset_name=None, default_schema_name="private")
.normalize_staging_dataset_name(Schema("private"))
- is None
+ == "_staging"
)
assert (
DestinationClientDwhConfiguration(staging_dataset_name_layout="static_staging")
diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py
index 748259cba1..1553cea04f 100644
--- a/tests/common/normalizers/test_json_relational.py
+++ b/tests/common/normalizers/test_json_relational.py
@@ -29,7 +29,7 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None:
"f 2": [],
"f!3": {"f4": "a", "f-5": "b", "f*6": {"c": 7, "c v": 8, "c x": []}},
}
- flattened_row, lists = norm._flatten("mock_table", row, 0)
+ flattened_row, lists = norm._flatten("mock_table", row, 1000)
assert "f_1" in flattened_row
# assert "f_2" in flattened_row
assert "f_3__f4" in flattened_row
@@ -93,7 +93,7 @@ def test_nested_table_linking(norm: RelationalNormalizer) -> None:
# request _dlt_root_id propagation
add_dlt_root_id_propagation(norm)
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
# should have 7 entries (root + level 1 + 3 * list + 2 * object)
assert len(rows) == 7
# root elem will not have a root hash if not explicitly added, "extend" is added only to child
@@ -144,7 +144,7 @@ def test_skip_nested_link_when_no_parent(norm: RelationalNormalizer) -> None:
table__f = new_table("table__f", parent_table_name=None)
norm.schema.update_table(table__f)
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
root = next(t for t in rows if t[0][0] == "table")[1]
# record hash is random for primary keys, not based on their content
# this is a change introduced in dlt 0.2.0a30
@@ -174,7 +174,7 @@ def test_yields_parents_first(norm: RelationalNormalizer) -> None:
"f": [{"id": "level1", "l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}],
"g": [{"id": "level2_g", "l": ["a"]}],
}
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
tables = list(r[0][0] for r in rows)
# child tables are always yielded before parent tables
expected_tables = [
@@ -220,7 +220,7 @@ def test_yields_parent_relation(norm: RelationalNormalizer) -> None:
}
],
}
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
# normalizer must return parent table first and move in order of the list elements when yielding child tables
# the yielding order if fully defined
expected_parents = [
@@ -281,7 +281,7 @@ def test_list_position(norm: RelationalNormalizer) -> None:
row: DictStrAny = {
"f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}]
}
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
# root has no pos
root = [t for t in rows if t[0][0] == "table"][0][1]
assert "_dlt_list_idx" not in root
@@ -436,7 +436,7 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None:
"_dlt_id": row_id,
"f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}],
}
- rows = list(norm._normalize_row(row, {}, ("table",)))
+ rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
children = [t for t in rows if t[0][0] != "table"]
# all hashes must be different
distinct_hashes = set([ch[1]["_dlt_id"] for ch in children])
@@ -455,19 +455,19 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None:
assert f_lo_p2["_dlt_id"] == digest128(f"{el_f['_dlt_id']}_table__f__lo_2", DLT_ID_LENGTH_BYTES)
# same data with same table and row_id
- rows_2 = list(norm._normalize_row(row, {}, ("table",)))
+ rows_2 = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
children_2 = [t for t in rows_2 if t[0][0] != "table"]
# corresponding hashes must be identical
assert all(ch[0][1]["_dlt_id"] == ch[1][1]["_dlt_id"] for ch in zip(children, children_2))
# change parent table and all child hashes must be different
- rows_4 = list(norm._normalize_row(row, {}, ("other_table",)))
+ rows_4 = list(norm._normalize_row(row, {}, ("other_table",), _r_lvl=1000, is_root=True))
children_4 = [t for t in rows_4 if t[0][0] != "other_table"]
assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, children_4))
# change parent hash and all child hashes must be different
row["_dlt_id"] = uniq_id()
- rows_3 = list(norm._normalize_row(row, {}, ("table",)))
+ rows_3 = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
children_3 = [t for t in rows_3 if t[0][0] != "table"]
assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, children_3))
@@ -483,7 +483,13 @@ def test_keeps_dlt_id(norm: RelationalNormalizer) -> None:
def test_propagate_hardcoded_context(norm: RelationalNormalizer) -> None:
row = {"level": 1, "list": ["a", "b", "c"], "comp": [{"_timestamp": "a"}]}
rows = list(
- norm._normalize_row(row, {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"}, ("table",))
+ norm._normalize_row(
+ row,
+ {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"},
+ ("table",),
+ _r_lvl=1000,
+ is_root=True,
+ )
)
# context is not added to root element
root = next(t for t in rows if t[0][0] == "table")[1]
@@ -514,7 +520,7 @@ def test_propagates_root_context(norm: RelationalNormalizer) -> None:
"dependent_list": [1, 2, 3],
"dependent_objects": [{"vx": "ax"}],
}
- normalized_rows = list(norm._normalize_row(row, {}, ("table",)))
+ normalized_rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
# all non-root rows must have:
non_root = [r for r in normalized_rows if r[0][1] is not None]
assert all(r[1]["_dlt_root_id"] == "###" for r in non_root)
@@ -553,7 +559,7 @@ def test_propagates_table_context(
# to reproduce a bug where rows with _dlt_id set were not extended
row["lvl1"][0]["_dlt_id"] = "row_id_lvl1" # type: ignore[index]
- normalized_rows = list(norm._normalize_row(row, {}, ("table",)))
+ normalized_rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
non_root = [r for r in normalized_rows if r[0][1] is not None]
# _dlt_root_id in all non root
assert all(r[1]["_dlt_root_id"] == "###" for r in non_root)
@@ -585,7 +591,7 @@ def test_propagates_table_context_to_lists(norm: RelationalNormalizer) -> None:
prop_config["root"][TColumnName("timestamp")] = TColumnName("_partition_ts")
row = {"_dlt_id": "###", "timestamp": 12918291.1212, "lvl1": [1, 2, 3, [4, 5, 6]]}
- normalized_rows = list(norm._normalize_row(row, {}, ("table",)))
+ normalized_rows = list(norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True))
# _partition_ts == timestamp on all child tables
non_root = [r for r in normalized_rows if r[0][1] is not None]
assert all(r[1]["_partition_ts"] == 12918291.1212 for r in non_root)
@@ -598,7 +604,7 @@ def test_removes_normalized_list(norm: RelationalNormalizer) -> None:
# after normalizing the list that got normalized into child table must be deleted
row = {"comp": [{"_timestamp": "a"}]}
# get iterator
- normalized_rows_i = norm._normalize_row(row, {}, ("table",))
+ normalized_rows_i = norm._normalize_row(row, {}, ("table",), _r_lvl=1000, is_root=True)
# yield just one item
root_row = next(normalized_rows_i)
# root_row = next(r for r in normalized_rows if r[0][1] is None)
@@ -622,7 +628,7 @@ def test_preserves_json_types_list(norm: RelationalNormalizer) -> None:
)
)
row = {"value": ["from", {"json": True}]}
- normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",)))
+ normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",), _r_lvl=1000, is_root=True))
# make sure only 1 row is emitted, the list is not normalized
assert len(normalized_rows) == 1
# value is kept in root row -> market as json
@@ -631,7 +637,7 @@ def test_preserves_json_types_list(norm: RelationalNormalizer) -> None:
# same should work for a list
row = {"value": ["from", ["json", True]]} # type: ignore[list-item]
- normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",)))
+ normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",), _r_lvl=1000, is_root=True))
# make sure only 1 row is emitted, the list is not normalized
assert len(normalized_rows) == 1
# value is kept in root row -> market as json
@@ -884,7 +890,7 @@ def test_caching_perf(norm: RelationalNormalizer) -> None:
table["x-normalizer"] = {}
start = time()
for _ in range(100000):
- norm._is_nested_type(norm.schema, "test", "field", 0, 0)
+ norm._is_nested_type(norm.schema, "test", "field", 0)
# norm._get_table_nesting_level(norm.schema, "test")
print(f"{time() - start}")
diff --git a/tests/common/runtime/test_run_context.py b/tests/common/runtime/test_run_context.py
index 84047b1b06..dd96a8129f 100644
--- a/tests/common/runtime/test_run_context.py
+++ b/tests/common/runtime/test_run_context.py
@@ -7,7 +7,7 @@
from dlt.common.configuration.container import Container
from dlt.common.configuration.specs import RuntimeConfiguration, PluggableRunContext
from dlt.common.runtime.init import _INITIALIZED, apply_runtime_config, restore_run_context
-from dlt.common.runtime.run_context import RunContext
+from dlt.common.runtime.run_context import RunContext, is_folder_writable
from tests.utils import MockableRunContext
@@ -128,3 +128,9 @@ def test_context_switch_restores_logger() -> None:
with Container().injectable_context(ctx):
assert logger.LOGGER.name == "dlt-tests-2"
assert logger.LOGGER.name == "dlt-tests"
+
+
+def test_tmp_folder_writable() -> None:
+ import tempfile
+
+ assert is_folder_writable(tempfile.gettempdir()) is True
diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py
index a813805ca0..0dcf2930de 100644
--- a/tests/common/storages/test_schema_storage.py
+++ b/tests/common/storages/test_schema_storage.py
@@ -480,6 +480,19 @@ def test_live_schema_getter_when_committed(live_storage: LiveSchemaStorage) -> N
assert id(set_schema_2) == id(set_schema)
+def test_live_schema_getter_when_committed_not_saved(live_storage: LiveSchemaStorage) -> None:
+ # getter returns the live schema instance even when it is committed but not yet saved to storage
+ schema = Schema("simple")
+ live_storage.set_live_schema(schema)
+ set_schema = live_storage["simple"]
+ assert schema is set_schema
+ # clear the is_modified flag so the schema is committed but not saved
+ schema._bump_version()
+ assert not schema.is_modified
+ set_schema = live_storage["simple"]
+ assert set_schema is schema
+
+
def test_new_live_schema_committed(live_storage: LiveSchemaStorage) -> None:
with pytest.raises(SchemaNotFoundError):
live_storage.is_live_schema_committed("simple")
diff --git a/tests/common/test_time.py b/tests/common/test_time.py
index 5e591061eb..8c25983d46 100644
--- a/tests/common/test_time.py
+++ b/tests/common/test_time.py
@@ -11,6 +11,7 @@
ensure_pendulum_date,
datetime_to_timestamp,
datetime_to_timestamp_ms,
+ detect_datetime_format,
)
from dlt.common.typing import TAnyDateTime
@@ -124,3 +125,43 @@ def test_datetime_to_timestamp_helpers(
) -> None:
assert datetime_to_timestamp(datetime_obj) == timestamp
assert datetime_to_timestamp_ms(datetime_obj) == timestamp_ms
+
+
+@pytest.mark.parametrize(
+ "value, expected_format",
+ [
+ ("2024-10-20T15:30:00Z", "%Y-%m-%dT%H:%M:%SZ"), # UTC 'Z'
+ ("2024-10-20T15:30:00.123456Z", "%Y-%m-%dT%H:%M:%S.%fZ"), # UTC 'Z' with fractional seconds
+ ("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%z"), # Timezone offset
+ ("2024-10-20T15:30:00+0200", "%Y-%m-%dT%H:%M:%S%z"), # Timezone without colon
+ ("2024-10-20T15:30:00", "%Y-%m-%dT%H:%M:%S"), # No timezone
+ ("2024-10-20T15:30", "%Y-%m-%dT%H:%M"), # Minute precision
+ ("2024-10-20T15", "%Y-%m-%dT%H"), # Hour precision
+ ("2024-10-20", "%Y-%m-%d"), # Date only
+ ("2024-10", "%Y-%m"), # Year and month
+ ("2024", "%Y"), # Year only
+ ("2024-W42", "%Y-W%W"), # Week-based date
+ ("2024-W42-5", "%Y-W%W-%u"), # Week-based date with day
+ ("2024-293", "%Y-%j"), # Ordinal date
+ ("20241020", "%Y%m%d"), # Compact date format
+ ("202410", "%Y%m"), # Compact year and month format
+ ],
+)
+def test_detect_datetime_format(value, expected_format) -> None:
+ assert detect_datetime_format(value) == expected_format
+ assert ensure_pendulum_datetime(value) is not None
+
+
+@pytest.mark.parametrize(
+ "value",
+ [
+ "invalid-format", # Invalid format
+ "2024/10/32", # Invalid format
+ "2024-10-W", # Invalid week format
+ "2024-10-W42-8", # Invalid day of the week
+ ],
+)
+def test_detect_datetime_format_invalid(value) -> None:
+ assert detect_datetime_format(value) is None
+ with pytest.raises(ValueError):
+ ensure_pendulum_datetime(value)
diff --git a/tests/conftest.py b/tests/conftest.py
index e5cf74fe35..6088fa976c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -117,6 +117,9 @@ def _create_pipeline_instance_id(self) -> str:
# disable httpx request logging (too verbose when testing qdrant)
logging.getLogger("httpx").setLevel("WARNING")
+ # disable googleapiclient logging
+ logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING")
+
# reset and init airflow db
import warnings
diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py
new file mode 100644
index 0000000000..0e01bb5267
--- /dev/null
+++ b/tests/destinations/test_readable_dbapi_dataset.py
@@ -0,0 +1,120 @@
+"""Unit tests for readable db api dataset and relation"""
+import dlt
+import pytest
+
+from dlt.destinations.dataset import (
+ ReadableRelationHasQueryException,
+ ReadableRelationUnknownColumnException,
+)
+
+
+def test_query_builder() -> None:
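+ # relations compose SELECT statements from the table name, selected columns and limit; .query returns the rendered SQL without executing it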
+ dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset()
+
+ # default query for a table
+ assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined]
+
+ # head query
+ assert (
+ dataset.my_table.head().query.strip() # type: ignore[attr-defined]
+ == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 5'
+ )
+
+ # limit query
+ assert (
+ dataset.my_table.limit(24).query.strip() # type: ignore[attr-defined]
+ == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 24'
+ )
+
+ # select columns
+ assert (
+ dataset.my_table.select("col1", "col2").query.strip() # type: ignore[attr-defined]
+ == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"'
+ )
+ # also indexer notation
+ assert (
+ dataset.my_table[["col1", "col2"]].query.strip() # type: ignore[attr-defined]
+ == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"'
+ )
+
+ # identifiers are normalized
+ assert (
+ dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() # type: ignore[attr-defined]
+ == 'SELECT "co_l1","c_ol2" FROM "pipeline_dataset"."my_table"'
+ )
+ assert (
+ dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() # type: ignore[attr-defined]
+ == 'SELECT "co__l1","c_ol2" FROM "pipeline_dataset"."my__table"'
+ )
+
+ # limit and select chained
+ assert (
+ dataset.my_table.select("col1", "col2").limit(24).query.strip() # type: ignore[attr-defined]
+ == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table" LIMIT 24'
+ )
+
+
+def test_copy_and_chaining() -> None:
+ dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset()
+
+ # create a relation and set a few attributes on it
+ relation = dataset.items
+ relation = relation.limit(34)
+ relation = relation[["one", "two"]]
+ relation._schema_columns = {"one": {}, "two": {}} # type: ignore[attr-defined]
+
+ relation2 = relation.__copy__()
+ assert relation != relation2
+ assert relation._limit == relation2._limit # type: ignore[attr-defined]
+ assert relation._table_name == relation2._table_name # type: ignore[attr-defined]
+ assert relation._provided_query == relation2._provided_query # type: ignore[attr-defined]
+ assert relation._selected_columns == relation2._selected_columns # type: ignore[attr-defined]
+
+ # test copy while chaining limit
+ relation3 = relation2.limit(22)
+ assert relation2 != relation3
+ assert relation2._limit != relation3._limit # type: ignore[attr-defined]
+
+ # test that the last setting prevails when chaining
+ assert relation.limit(23).limit(67).limit(11)._limit == 11 # type: ignore[attr-defined]
+
+
+def test_computed_schema_columns() -> None:
+ dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset()
+ relation = dataset.items
+
+ # no schema present
+ assert relation.columns_schema is None
+
+ # we can select any columns because they cannot be verified without a schema
+ relation["one", "two"]
+
+ # now add columns
+ relation = dataset.items
+ dataset.schema.tables["items"] = { # type: ignore[attr-defined]
+ "columns": {"one": {"data_type": "text"}, "two": {"data_type": "json"}}
+ }
+
+ # computed columns are same as above
+ assert relation.columns_schema == {"one": {"data_type": "text"}, "two": {"data_type": "json"}}
+
+ # when selecting only one column, computing schema columns will only show that one
+ assert relation.select("one").columns_schema == {"one": {"data_type": "text"}}
+
+ # selecting an unknown column fails
+ with pytest.raises(ReadableRelationUnknownColumnException):
+ relation["unknown_columns"]
+
+
+def test_prevent_changing_relation_with_query() -> None:
+ dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset()
+ relation = dataset("SELECT * FROM something")
+
+ with pytest.raises(ReadableRelationHasQueryException):
+ relation.limit(5)
+
+ with pytest.raises(ReadableRelationHasQueryException):
+ relation.head()
+
+ with pytest.raises(ReadableRelationHasQueryException):
+ relation.select("hello", "hillo")
diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py
index 0a0de75987..7ce4228b6c 100644
--- a/tests/extract/test_incremental.py
+++ b/tests/extract/test_incremental.py
@@ -2,7 +2,7 @@
import inspect
import os
import random
-from datetime import datetime # noqa: I251
+from datetime import datetime, date # noqa: I251
from itertools import chain, count
from time import sleep
from typing import Any, Optional
@@ -34,10 +34,12 @@
IncrementalCursorPathMissing,
IncrementalPrimaryKeyMissing,
)
+from dlt.extract.incremental.lag import apply_lag
from dlt.extract.items import ValidateItem
from dlt.extract.resource import DltResource
from dlt.pipeline.exceptions import PipelineStepFailed
from dlt.sources.helpers.transform import take_first
+
from tests.extract.utils import AssertItems, data_item_to_list
from tests.pipeline.utils import assert_query_data
from tests.utils import (
@@ -2586,3 +2588,983 @@ def updated_is_int(updated_at=dlt.sources.incremental("updated_at", initial_valu
pipeline.run(updated_is_int())
assert isinstance(pip_ex.value.__cause__, IncrementalCursorInvalidCoercion)
assert pip_ex.value.__cause__.cursor_path == "updated_at"
+
+
+def test_incremental_merge_native_representation():
+ incremental = Incremental(cursor_path="some_path", lag=10) # type: ignore
+
+ native_value = Incremental(cursor_path="another_path", lag=5) # type: ignore
+
+ incremental.parse_native_representation(native_value)
+
+ # Assert the expected changes in the incremental object
+ assert incremental.cursor_path == "another_path"
+ assert incremental.lag == 5
+
+
+@pytest.mark.parametrize("lag", [0, 1, 100, 200, 1000])
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_int(lag: float, last_value_func) -> None:
+ """
+ Test incremental lag behavior for int data, using `id` as the primary key and the append write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(_=dlt.sources.incremental("id", lag=lag, last_value_func=last_value_func)):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 100, "event": "100"},
+ {"id": 200, "event": "200"},
+ {"id": 300, "event": "300"},
+ ]
+
+ second_run_events = [
+ {"id": 100, "event": "100_updated_1"},
+ {"id": 200, "event": "200_updated_1"},
+ {"id": 300, "event": "300_updated_1"},
+ {"id": 400, "event": "400"},
+ ]
+
+ third_run_events = [
+ {"id": 100, "event": "100_updated_2"},
+ {"id": 200, "event": "200_updated_2"},
+ {"id": 300, "event": "300_updated_2"},
+ {"id": 400, "event": "400_updated_2"},
+ {"id": 500, "event": "500"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Results using APPEND write disposition
+ # Expected results based on `last_value_func`
+ if last_value_func == max:
+ expected_results = {
+ 1000: [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "100_updated_2",
+ "200_updated_2",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ 200: [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "200_updated_2",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ 100: [
+ "100",
+ "200",
+ "300",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ 1: ["100", "200", "300", "300_updated_1", "400", "400_updated_2", "500"],
+ 0: ["100", "200", "300", "400", "500"],
+ }
+ else:
+ expected_results = {
+ 1000: [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "100_updated_2",
+ "200_updated_2",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ 200: [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "300_updated_1",
+ "100_updated_2",
+ "200_updated_2",
+ "300_updated_2",
+ ],
+ 100: [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "100_updated_2",
+ "200_updated_2",
+ ],
+ 1: ["100", "200", "300", "100_updated_1", "100_updated_2"],
+ 0: ["100", "200", "300"],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[int(lag)]
+
+
+@pytest.mark.parametrize("lag", [7200, 3601, 3600, 60, 0])
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_datetime_str(lag: float, last_value_func) -> None:
+ """
+ Test incremental lag behavior for datetime data, using `id` as the primary key and the merge write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="merge")
+ def events_resource(
+ _=dlt.sources.incremental("created_at", lag=lag, last_value_func=last_value_func)
+ ):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 1, "created_at": "2023-03-03T01:00:00Z", "event": "1"},
+ {"id": 2, "created_at": "2023-03-03T01:00:01Z", "event": "2"},
+ {"id": 3, "created_at": "2023-03-03T02:00:01Z", "event": "3"},
+ ]
+
+ second_run_events = [
+ {"id": 1, "created_at": "2023-03-03T01:00:00Z", "event": "1_updated_1"},
+ {"id": 2, "created_at": "2023-03-03T01:00:01Z", "event": "2_updated_1"},
+ {"id": 3, "created_at": "2023-03-03T02:00:01Z", "event": "3_updated_1"},
+ {"id": 4, "created_at": "2023-03-03T03:00:00Z", "event": "4"},
+ ]
+
+ third_run_events = [
+ {"id": 1, "created_at": "2023-03-03T01:00:00Z", "event": "1_updated_2"},
+ {"id": 2, "created_at": "2023-03-03T01:00:01Z", "event": "2_updated_2"},
+ {"id": 3, "created_at": "2023-03-03T02:00:01Z", "event": "3_updated_2"},
+ {"id": 4, "created_at": "2023-03-03T03:00:00Z", "event": "4_updated_2"},
+ {"id": 5, "created_at": "2023-03-03T03:00:00Z", "event": "5"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Results using MERGE write disposition
+ # Expected results based on `last_value_func`
+ if last_value_func == max:
+ expected_results = {
+ 7200: ["1_updated_2", "2_updated_2", "3_updated_2", "4_updated_2", "5"],
+ 3601: ["1_updated_1", "2_updated_1", "3_updated_2", "4_updated_2", "5"],
+ 3600: ["1", "2_updated_1", "3_updated_2", "4_updated_2", "5"],
+ 60: ["1", "2", "3_updated_1", "4_updated_2", "5"],
+ 0: ["1", "2", "3", "4", "5"],
+ }
+ else:
+ expected_results = {
+ 7200: ["1_updated_2", "2_updated_2", "3_updated_2", "4_updated_2", "5"],
+ 3601: ["1_updated_2", "2_updated_2", "3_updated_2"],
+ 3600: ["3", "1_updated_2", "2_updated_2"],
+ 60: ["3", "1_updated_2", "2_updated_2"],
+ 0: ["1", "2", "3"],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[int(lag)]
+
+
+@pytest.mark.parametrize("lag", [3601, 3600, 60, 0])
+def test_incremental_lag_disabled_with_custom_last_value_func(lag: float) -> None:
+ """
+ Test that incremental lag is disabled when last_value_func is neither min nor max.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+
+ def custom_function(values):
+ return max(values)
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(_=dlt.sources.incremental("id", lag=lag, last_value_func=custom_function)):
+ nonlocal is_second_run
+
+ initial_entries = [
+ {"id": 100, "event": "100"},
+ {"id": 200, "event": "200"},
+ {"id": 300, "event": "300"},
+ ]
+
+ second_run_events = [
+ {"id": 100, "event": "100_updated_1"},
+ {"id": 200, "event": "200_updated_1"},
+ {"id": 300, "event": "300_updated_1"},
+ {"id": 400, "event": "400"},
+ ]
+
+ yield second_run_events if is_second_run else initial_entries
+
+ # Run the pipeline twice
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == ["100", "200", "300", "400"]
+
+
+@pytest.mark.parametrize("lag", [-3601, -3600, -60, 0])
+@pytest.mark.parametrize("end_value", [-1, 0, 500])
+def test_incremental_lag_disabled_with_end_values(lag: float, end_value: float) -> None:
+ """
+ Test that incremental lag is disabled when end_value is used.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(
+ _=dlt.sources.incremental("id", lag=lag, initial_value=-450, end_value=end_value)
+ ):
+ nonlocal is_second_run
+
+ # prepare negative ids so that for all end_values the table is loaded with a cutoff at -450
+ # lag, if applied, would skip values even from the initial load (lag==-3600)
+ initial_entries = [
+ {"id": -100, "event": "100"},
+ {"id": -200, "event": "200"},
+ {"id": -300, "event": "300"},
+ ]
+
+ second_run_events = [
+ {"id": -100, "event": "100_updated_1"},
+ {"id": -200, "event": "200_updated_1"},
+ {"id": -300, "event": "300_updated_1"},
+ {"id": -400, "event": "400"},
+ {"id": -500, "event": "500"},
+ {"id": -600, "event": "600"},
+ {"id": -700, "event": "700"},
+ ]
+
+ yield second_run_events if is_second_run else initial_entries
+
+ # Run the pipeline twice
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(
+ f"SELECT event FROM {name} ORDER BY _dlt_load_id ASC, id DESC"
+ )
+ ]
+ assert result == [
+ "100",
+ "200",
+ "300",
+ "100_updated_1",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ ]
+
+
+@pytest.mark.parametrize("lag", [3, 2, 1, 0]) # Lag in days
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_date_str(lag: int, last_value_func) -> None:
+ """
+ Test incremental lag behavior for date data given as ISO strings, using `id` as the primary key and the append write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(
+ _=dlt.sources.incremental("created_at", lag=lag, last_value_func=last_value_func)
+ ):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 1, "created_at": "2023-03-01", "event": "1"},
+ {"id": 2, "created_at": "2023-03-02", "event": "2"},
+ {"id": 3, "created_at": "2023-03-03", "event": "3"},
+ ]
+
+ second_run_events = [
+ {"id": 1, "created_at": "2023-03-01", "event": "1_updated_1"},
+ {"id": 2, "created_at": "2023-03-02", "event": "2_updated_1"},
+ {"id": 3, "created_at": "2023-03-03", "event": "3_updated_1"},
+ {"id": 4, "created_at": "2023-03-04", "event": "4"},
+ ]
+
+ third_run_events = [
+ {"id": 1, "created_at": "2023-03-01", "event": "1_updated_2"},
+ {"id": 2, "created_at": "2023-03-02", "event": "2_updated_2"},
+ {"id": 3, "created_at": "2023-03-03", "event": "3_updated_2"},
+ {"id": 4, "created_at": "2023-03-04", "event": "4_updated_2"},
+ {"id": 5, "created_at": "2023-03-05", "event": "5"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Expected results based on `last_value_func` and lag (in days)
+ if last_value_func == max:
+ expected_results = {
+ 3: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 2: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 1: [
+ "1",
+ "2",
+ "3",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 0: ["1", "2", "3", "4", "5"],
+ }
+ else:
+ expected_results = {
+ 3: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ ],
+ 2: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ ],
+ 1: ["1", "2", "3", "1_updated_1", "2_updated_1", "1_updated_2", "2_updated_2"],
+ 0: ["1", "2", "3"],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[lag]
+
+
+@pytest.mark.parametrize("lag", [3, 2, 1, 0]) # Lag in days
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_date_datetime(lag: int, last_value_func) -> None:
+ """
+ Test incremental lag behavior for date objects, using `id` as the primary key and the append write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(
+ _=dlt.sources.incremental("created_at", lag=lag, last_value_func=last_value_func)
+ ):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 1, "created_at": date(2023, 3, 1), "event": "1"},
+ {"id": 2, "created_at": date(2023, 3, 2), "event": "2"},
+ {"id": 3, "created_at": date(2023, 3, 3), "event": "3"},
+ ]
+
+ second_run_events = [
+ {"id": 1, "created_at": date(2023, 3, 1), "event": "1_updated_1"},
+ {"id": 2, "created_at": date(2023, 3, 2), "event": "2_updated_1"},
+ {"id": 3, "created_at": date(2023, 3, 3), "event": "3_updated_1"},
+ {"id": 4, "created_at": date(2023, 3, 4), "event": "4"},
+ ]
+
+ third_run_events = [
+ {"id": 1, "created_at": date(2023, 3, 1), "event": "1_updated_2"},
+ {"id": 2, "created_at": date(2023, 3, 2), "event": "2_updated_2"},
+ {"id": 3, "created_at": date(2023, 3, 3), "event": "3_updated_2"},
+ {"id": 4, "created_at": date(2023, 3, 4), "event": "4_updated_2"},
+ {"id": 5, "created_at": date(2023, 3, 5), "event": "5"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Expected results based on `last_value_func` and lag (in days)
+ if last_value_func == max:
+ expected_results = {
+ 3: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 2: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 1: [
+ "1",
+ "2",
+ "3",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "3_updated_2",
+ "4_updated_2",
+ "5",
+ ],
+ 0: ["1", "2", "3", "4", "5"],
+ }
+ else:
+ expected_results = {
+ 3: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "4",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ "4_updated_2",
+ ],
+ 2: [
+ "1",
+ "2",
+ "3",
+ "1_updated_1",
+ "2_updated_1",
+ "3_updated_1",
+ "1_updated_2",
+ "2_updated_2",
+ "3_updated_2",
+ ],
+ 1: ["1", "2", "3", "1_updated_1", "2_updated_1", "1_updated_2", "2_updated_2"],
+ 0: ["1", "2", "3"],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[lag]
+
+
+@pytest.mark.parametrize("lag", [200, 1000])
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_int_with_initial_values(lag: float, last_value_func) -> None:
+ """
+ Test incremental lag behavior with an initial_value for int data, using `id` as the primary key and the append write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(
+ _=dlt.sources.incremental("id", lag=lag, initial_value=200, last_value_func=last_value_func)
+ ):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 100, "event": "100"},
+ {"id": 200, "event": "200"},
+ {"id": 300, "event": "300"},
+ ]
+
+ second_run_events = [
+ {"id": 100, "event": "100_updated_1"},
+ {"id": 200, "event": "200_updated_1"},
+ {"id": 300, "event": "300_updated_1"},
+ {"id": 400, "event": "400"},
+ ]
+
+ third_run_events = [
+ {"id": 100, "event": "100_updated_2"},
+ {"id": 200, "event": "200_updated_2"},
+ {"id": 300, "event": "300_updated_2"},
+ {"id": 400, "event": "400_updated_2"},
+ {"id": 500, "event": "500"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Results using APPEND write disposition
+ # Expected results based on `last_value_func`
+ if last_value_func == max:
+ expected_results = {
+ 1000: [
+ "200",
+ "300",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "200_updated_2",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ 200: [
+ "200",
+ "300",
+ "200_updated_1",
+ "300_updated_1",
+ "400",
+ "200_updated_2",
+ "300_updated_2",
+ "400_updated_2",
+ "500",
+ ],
+ }
+ else:
+ expected_results = {
+ 1000: [
+ "100",
+ "200",
+ "100_updated_1",
+ "200_updated_1",
+ "100_updated_2",
+ "200_updated_2",
+ ],
+ 200: [
+ "100",
+ "200",
+ "100_updated_1",
+ "200_updated_1",
+ "100_updated_2",
+ "200_updated_2",
+ ],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[int(lag)]
+
+
+@pytest.mark.parametrize("lag", [0, 1.0, 1.5, 2.0])
+@pytest.mark.parametrize("last_value_func", [min, max])
+def test_incremental_lag_float(lag: float, last_value_func) -> None:
+ """
+ Test incremental lag behavior for float data, using `id` as the primary key and the append write disposition.
+ """
+
+ pipeline = dlt.pipeline(
+ pipeline_name=uniq_id(),
+ destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
+ )
+
+ name = "events"
+ is_second_run = False
+ is_third_run = False
+
+ @dlt.resource(name=name, primary_key="id", write_disposition="append")
+ def events_resource(_=dlt.sources.incremental("id", lag=lag, last_value_func=last_value_func)):
+ nonlocal is_second_run
+ nonlocal is_third_run
+
+ initial_entries = [
+ {"id": 1.0, "event": "1"},
+ {"id": 2.0, "event": "2"},
+ ]
+
+ second_run_events = [
+ {"id": 1.0, "event": "1_updated_1"},
+ {"id": 2.0, "event": "2_updated_1"},
+ {"id": 2.5, "event": "2-5"},
+ ]
+
+ third_run_events = [
+ {"id": 1.0, "event": "1_updated_2"},
+ {"id": 2.0, "event": "2_updated_2"},
+ {"id": 2.5, "event": "2-5_updated_2"},
+ {"id": 3.0, "event": "3"},
+ ]
+
+ if is_second_run:
+ yield from second_run_events
+ elif is_third_run:
+ yield from third_run_events
+ else:
+ yield from initial_entries
+
+ # Run the pipeline three times
+ pipeline.run(events_resource)
+ is_second_run = True
+ pipeline.run(events_resource)
+ is_second_run = False
+ is_third_run = True
+ pipeline.run(events_resource)
+
+ # Results using APPEND write disposition
+ # Expected results based on `last_value_func`
+ if last_value_func == max:
+ expected_results = {
+ 2.0: [
+ "1",
+ "2",
+ "1_updated_1",
+ "2_updated_1",
+ "2-5",
+ "1_updated_2",
+ "2_updated_2",
+ "2-5_updated_2",
+ "3",
+ ],
+ 1.5: [
+ "1",
+ "2",
+ "1_updated_1",
+ "2_updated_1",
+ "2-5",
+ "1_updated_2",
+ "2_updated_2",
+ "2-5_updated_2",
+ "3",
+ ],
+ 1.0: [
+ "1",
+ "2",
+ "1_updated_1",
+ "2_updated_1",
+ "2-5",
+ "2_updated_2",
+ "2-5_updated_2",
+ "3",
+ ],
+ 0: ["1", "2", "2-5", "3"],
+ }
+ else:
+ expected_results = {
+ 2.0: [
+ "1",
+ "2",
+ "1_updated_1",
+ "2_updated_1",
+ "2-5",
+ "1_updated_2",
+ "2_updated_2",
+ "2-5_updated_2",
+ "3",
+ ],
+ 1.5: [
+ "1",
+ "2",
+ "1_updated_1",
+ "2_updated_1",
+ "2-5",
+ "1_updated_2",
+ "2_updated_2",
+ "2-5_updated_2",
+ ],
+ 1.0: ["1", "2", "1_updated_1", "2_updated_1", "1_updated_2", "2_updated_2"],
+ 0: ["1", "2"],
+ }
+
+ with pipeline.sql_client() as sql_client:
+ result = [
+ row[0]
+ for row in sql_client.execute_sql(f"SELECT event FROM {name} ORDER BY _dlt_load_id, id")
+ ]
+ assert result == expected_results[lag]
+
+
+def test_apply_lag() -> None:
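+ # apply_lag(lag, initial_value, last_value, last_value_func) shifts last_value back by lag for max and forward for min, clamped so it never crosses initial_value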
+ # test date lag
+ assert apply_lag(1, None, date(2023, 3, 2), max) == date(2023, 3, 1)
+ assert apply_lag(1, None, date(2023, 3, 2), min) == date(2023, 3, 3)
+ # can't go below initial_value
+ assert apply_lag(1, date(2023, 3, 2), date(2023, 3, 2), max) == date(2023, 3, 2)
+ assert apply_lag(-1, date(2023, 3, 2), date(2023, 3, 2), max) == date(2023, 3, 3)
+ # can't go above initial_value
+ assert apply_lag(1, date(2023, 3, 2), date(2023, 3, 2), min) == date(2023, 3, 2)
+ assert apply_lag(-1, date(2023, 3, 2), date(2023, 3, 2), min) == date(2023, 3, 1)
+
+ # test str date lag
+ assert apply_lag(1, None, "2023-03-02", max) == "2023-03-01"
+ assert apply_lag(1, None, "2023-03-02", min) == "2023-03-03"
+ # initial value
+ assert apply_lag(1, "2023-03-01", "2023-03-02", max) == "2023-03-01"
+ assert apply_lag(2, "2023-03-01", "2023-03-02", max) == "2023-03-01"
+
+ assert apply_lag(1, "2023-03-03", "2023-03-02", min) == "2023-03-03"
+ assert apply_lag(2, "2023-03-03", "2023-03-02", min) == "2023-03-03"
+
+ # test datetime lag
+ assert apply_lag(1, None, datetime(2023, 3, 2, 1, 15, 30), max) == datetime(
+ 2023, 3, 2, 1, 15, 29
+ )
+ assert apply_lag(1, None, datetime(2023, 3, 2, 1, 15, 30), min) == datetime(
+ 2023, 3, 2, 1, 15, 31
+ )
+ # initial value
+ assert apply_lag(
+ 1, datetime(2023, 3, 2, 1, 15, 29), datetime(2023, 3, 2, 1, 15, 30), max
+ ) == datetime(2023, 3, 2, 1, 15, 29)
+ assert apply_lag(
+ 2, datetime(2023, 3, 2, 1, 15, 29), datetime(2023, 3, 2, 1, 15, 30), max
+ ) == datetime(2023, 3, 2, 1, 15, 29)
+ assert apply_lag(
+ 1, datetime(2023, 3, 2, 1, 15, 31), datetime(2023, 3, 2, 1, 15, 30), min
+ ) == datetime(2023, 3, 2, 1, 15, 31)
+ assert apply_lag(
+ 2, datetime(2023, 3, 2, 1, 15, 31), datetime(2023, 3, 2, 1, 15, 30), min
+ ) == datetime(2023, 3, 2, 1, 15, 31)
+
+ # datetime str
+ assert apply_lag(1, None, "2023-03-03T01:15:30Z", max) == "2023-03-03T01:15:29Z"
+ assert apply_lag(1, None, "2023-03-03T01:15:30Z", min) == "2023-03-03T01:15:31Z"
+ # initial value
+ assert (
+ apply_lag(1, "2023-03-03T01:15:29Z", "2023-03-03T01:15:30Z", max) == "2023-03-03T01:15:29Z"
+ )
+ assert (
+ apply_lag(2, "2023-03-03T01:15:29Z", "2023-03-03T01:15:30Z", max) == "2023-03-03T01:15:29Z"
+ )
+ assert (
+ apply_lag(1, "2023-03-03T01:15:31Z", "2023-03-03T01:15:30Z", min) == "2023-03-03T01:15:31Z"
+ )
+ assert (
+ apply_lag(2, "2023-03-03T01:15:31Z", "2023-03-03T01:15:30Z", min) == "2023-03-03T01:15:31Z"
+ )
+
+ # int/float
+ assert apply_lag(1, None, 1, max) == 0
+ assert apply_lag(1, None, 1, min) == 2
+ # initial
+ assert apply_lag(1, 0, 1, max) == 0
+ assert apply_lag(2, 0, 1, max) == 0
+ assert apply_lag(1, 2, 1, min) == 2
+ assert apply_lag(2, 2, 1, min) == 2
+
+
+@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS)
+@pytest.mark.parametrize("primary_key", ["id", None])
+def test_warning_large_deduplication_state(item_type: TestDataItemFormat, primary_key, mocker):
+ @dlt.resource(primary_key=primary_key)
+ def some_data(
+ created_at=dlt.sources.incremental("created_at"),
+ ):
+ # Cross the default threshold of 200
+ yield data_to_item_format(
+ item_type,
+ [{"id": i, "created_at": 1} for i in range(201)],
+ )
+ # Second batch adds more items but shouldn't trigger the warning again
+ yield data_to_item_format(
+ item_type,
+ [{"id": i, "created_at": 1} for i in range(201, 301)],
+ )
+
+ logger_spy = mocker.spy(dlt.common.logger, "warning")
+ p = dlt.pipeline(pipeline_name=uniq_id())
+ p.extract(some_data(1))
+
+ # Verify warning was called exactly once
+ warning_calls = [
+ call for call in logger_spy.call_args_list if "Large number of records" in call.args[0]
+ ]
+ assert len(warning_calls) == 1
diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py
index ac12f70037..69e48733e3 100644
--- a/tests/helpers/airflow_tests/test_airflow_wrapper.py
+++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py
@@ -1,7 +1,7 @@
import os
import pytest
from unittest import mock
-from typing import List
+from typing import Iterator, List
from airflow import DAG
from airflow.decorators import dag
from airflow.operators.python import PythonOperator, get_current_context
@@ -11,7 +11,7 @@
import dlt
from dlt.common import logger, pendulum
-from dlt.common.utils import uniq_id
+from dlt.common.utils import set_working_dir, uniq_id
from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention
from dlt.helpers.airflow_helper import PipelineTasksGroup, DEFAULT_RETRY_BACKOFF
@@ -21,6 +21,12 @@
from tests.utils import TEST_STORAGE_ROOT
+@pytest.fixture(autouse=True)
+def run_in_storage(autouse_test_storage) -> Iterator[None]:
+ with set_working_dir("_storage"):
+ yield
+
+
DEFAULT_DATE = pendulum.datetime(2023, 4, 18, tz="Europe/Berlin")
default_args = {
@@ -191,7 +197,10 @@ def dag_regular():
dag_def.test()
# we should be able to attach to pipeline state created within Airflow
- pipeline_dag_regular = dlt.attach(pipeline_name="pipeline_dag_regular")
+ pipeline_dag_regular = dlt.attach(
+ pipeline_name="pipeline_dag_regular",
+ destination=dlt.destinations.duckdb(credentials=":pipeline:"),
+ )
pipeline_dag_regular_counts = load_table_counts(
pipeline_dag_regular,
*[t["name"] for t in pipeline_dag_regular.default_schema.data_tables()],
@@ -230,7 +239,10 @@ def dag_decomposed():
assert tasks_list[1].task_id == "pipeline_dag_decomposed.mock_data_source__t1-_t2-_t3"
assert tasks_list[2].task_id == "pipeline_dag_decomposed.mock_data_source__r_isolee"
dag_def.test()
- pipeline_dag_decomposed = dlt.attach(pipeline_name="pipeline_dag_decomposed")
+ pipeline_dag_decomposed = dlt.attach(
+ pipeline_name="pipeline_dag_decomposed",
+ destination=dlt.destinations.duckdb(credentials=quackdb_path),
+ )
pipeline_dag_decomposed_counts = load_table_counts(
pipeline_dag_decomposed,
*[t["name"] for t in pipeline_dag_decomposed.default_schema.data_tables()],
@@ -273,7 +285,14 @@ def dag_regular():
dag_def.test()
- pipeline_dag_regular = dlt.attach(pipeline_name="pipeline_dag_regular")
+ pipeline_dag_regular = dlt.attach(
+ pipeline_name="pipeline_dag_regular",
+ destination=dlt.destinations.duckdb(credentials=quackdb_path),
+ )
+ assert pipeline_dag_regular.first_run is False
+ assert (
+ pipeline_dag_regular.destination.config_params["bound_to_pipeline"] is pipeline_dag_regular
+ )
pipeline_dag_regular_counts = load_table_counts(
pipeline_dag_regular,
*[t["name"] for t in pipeline_dag_regular.default_schema.data_tables()],
@@ -324,7 +343,10 @@ def dag_parallel():
assert len(tasks_list) == 4
dag_def.test()
- pipeline_dag_parallel = dlt.attach(pipeline_name="pipeline_dag_parallel")
+ pipeline_dag_parallel = dlt.attach(
+ pipeline_name="pipeline_dag_parallel",
+ destination=dlt.destinations.duckdb(credentials=quackdb_path),
+ )
results = load_table_counts(
pipeline_dag_parallel,
*[t["name"] for t in pipeline_dag_parallel.default_schema.data_tables()],
@@ -435,7 +457,8 @@ def dag_parallel():
pipeline_dag_parallel = dlt.attach(
pipeline_name=snake_case.normalize_identifier(
dag_def.tasks[i].task_id.replace("pipeline_dag_parallel.", "")[:-2]
- )
+ ),
+ destination=dlt.destinations.duckdb(credentials=quackdb_path),
)
pipeline_dag_decomposed_counts = load_table_counts(
pipeline_dag_parallel,
@@ -491,7 +514,10 @@ def dag_parallel():
dag_def = dag_parallel()
assert len(tasks_list) == 2
dag_def.test()
- pipeline_dag_parallel = dlt.attach(pipeline_name="pipeline_dag_parallel")
+ pipeline_dag_parallel = dlt.attach(
+ pipeline_name="pipeline_dag_parallel",
+ destination=dlt.destinations.duckdb(credentials=quackdb_path),
+ )
pipeline_dag_decomposed_counts = load_table_counts(
pipeline_dag_parallel,
*[t["name"] for t in pipeline_dag_parallel.default_schema.data_tables()],
@@ -946,7 +972,9 @@ def dag_regular():
dag_def: DAG = dag_regular()
dag_def.test()
- pipeline_dag = dlt.attach(pipeline_name="callable_dag")
+ pipeline_dag = dlt.attach(
+ pipeline_name="callable_dag", destination=dlt.destinations.duckdb(credentials=quackdb_path)
+ )
with pipeline_dag.sql_client() as client:
with client.execute_query("SELECT * FROM test_res") as result:
diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py
index 1b5874ede1..67908e176c 100644
--- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py
+++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py
@@ -79,10 +79,10 @@ def test_infer_venv_deps() -> None:
# provide version ranges
requirements = _create_dbt_deps(["duckdb"], dbt_version=">3")
# special duckdb dependency
- assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==1.1.0"]
+ assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==1.1.2"]
# we do not validate version ranges, pip will do it and fail when creating venv
requirements = _create_dbt_deps(["motherduck"], dbt_version="y")
- assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==1.1.0"]
+ assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==1.1.2"]
def test_default_profile_name() -> None:
diff --git a/tests/load/clickhouse/test_clickhouse_adapter.py b/tests/load/clickhouse/test_clickhouse_adapter.py
index e8e2b327c0..f36df0ee3f 100644
--- a/tests/load/clickhouse/test_clickhouse_adapter.py
+++ b/tests/load/clickhouse/test_clickhouse_adapter.py
@@ -1,7 +1,6 @@
from typing import Generator, Dict, cast
import dlt
-from dlt.common.utils import custom_environ
from dlt.destinations.adapters import clickhouse_adapter
from dlt.destinations.impl.clickhouse.sql_client import ClickHouseSqlClient
from dlt.destinations.impl.clickhouse.typing import TDeployment
@@ -34,7 +33,12 @@ def not_annotated_resource() -> Generator[Dict[str, int], None, None]:
clickhouse_adapter(merge_tree_resource, table_engine_type="merge_tree")
clickhouse_adapter(replicated_merge_tree_resource, table_engine_type="replicated_merge_tree")
- pipe = dlt.pipeline(pipeline_name="adapter_test", destination="clickhouse", dev_mode=True)
+ pipe = dlt.pipeline(
+ pipeline_name="adapter_test",
+ destination="clickhouse",
+ dev_mode=True,
+ dataset_name="adapter_test_ds",
+ )
with pipe.sql_client() as client:
deployment_type: TDeployment = get_deployment_type(cast(ClickHouseSqlClient, client))
diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py
index bb989a887c..e27da4db2a 100644
--- a/tests/load/databricks/test_databricks_configuration.py
+++ b/tests/load/databricks/test_databricks_configuration.py
@@ -8,7 +8,10 @@
from dlt.common.configuration import resolve_configuration
from dlt.destinations import databricks
-from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration
+from dlt.destinations.impl.databricks.configuration import (
+ DatabricksClientConfiguration,
+ DATABRICKS_APPLICATION_ID,
+)
# mark all tests as essential, do not remove
pytestmark = pytest.mark.essential
@@ -37,6 +40,7 @@ def test_databricks_credentials_to_connector_params():
assert params["extra_a"] == "a"
assert params["extra_b"] == "b"
assert params["_socket_timeout"] == credentials.socket_timeout
+ assert params["_user_agent_entry"] == DATABRICKS_APPLICATION_ID
def test_databricks_configuration() -> None:
diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py
index f4088a7608..a9479a0bb9 100644
--- a/tests/load/duckdb/test_duckdb_client.py
+++ b/tests/load/duckdb/test_duckdb_client.py
@@ -5,8 +5,10 @@
import dlt
from dlt.common.configuration.resolve import resolve_configuration
from dlt.common.configuration.utils import get_resolved_traces
-
from dlt.common.destination.reference import Destination
+from dlt.common.utils import set_working_dir
+
+from dlt.destinations.exceptions import DatabaseUndefinedRelation
from dlt.destinations.impl.duckdb.configuration import (
DuckDbClientConfiguration,
DEFAULT_DUCK_DB_NAME,
@@ -15,6 +17,7 @@
from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials
from dlt.pipeline.exceptions import PipelineStepFailed
+
from tests.pipeline.utils import assert_table
from tests.utils import patch_home_dir, autouse_test_storage, TEST_STORAGE_ROOT
@@ -23,11 +26,12 @@
@pytest.fixture(autouse=True)
-def delete_default_duckdb_credentials() -> Iterator[None]:
+def delete_default_duckdb_credentials(autouse_test_storage) -> Iterator[None]:
# remove the default duckdb config
# os.environ.pop("DESTINATION__DUCKDB__CREDENTIALS", None)
os.environ.clear()
- yield
+ with set_working_dir("_storage"):
+ yield
delete_quack_db()
@@ -102,17 +106,29 @@ def test_duckdb_database_path() -> None:
DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset")
)
assert c.credentials._conn_str().lower() == os.path.abspath("quack.duckdb").lower()
+
# resolve without any path but with pipeline context
p = dlt.pipeline(pipeline_name="quack_pipeline")
+ # pipeline context must be passed explicitly
c = resolve_configuration(
DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset")
)
+ assert c.credentials._conn_str().lower() == os.path.abspath("quack.duckdb").lower()
+ # pass explicitly
+ c = resolve_configuration(
+ DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset"),
+ explicit_value={"bound_to_pipeline": p},
+ )
# still cwd
db_path = os.path.abspath(os.path.join(".", "quack_pipeline.duckdb"))
assert c.credentials._conn_str().lower() == db_path.lower()
- # we do not keep default duckdb path in the local state
- with pytest.raises(KeyError):
- p.get_local_state_val("duckdb_database")
+
+ # must work via factory
+ factory_ = dlt.destinations.duckdb(bound_to_pipeline=p)
+ c = factory_.configuration(
+ DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_dataset")
+ )
+ assert c.credentials._conn_str().lower() == db_path.lower()
# connect
try:
@@ -123,11 +139,30 @@ def test_duckdb_database_path() -> None:
if os.path.isfile(db_path):
os.unlink(db_path)
+ # must work via pipeline
+ duck_p = dlt.pipeline(pipeline_name="quack_pipeline_exp", destination="duckdb")
+ db_path = os.path.abspath(os.path.join(".", "quack_pipeline_exp.duckdb"))
+ assert duck_p.sql_client().credentials._conn_str().lower() == db_path.lower()
+
+ duck_p = dlt.pipeline(pipeline_name="quack_pipeline_exp", destination=dlt.destinations.duckdb())
+ creds_ = duck_p.sql_client().credentials
+ assert creds_._conn_str().lower() == db_path.lower()
+
+ # connect
+ try:
+ conn = creds_.borrow_conn(read_only=False)
+ creds_.return_conn(conn)
+ assert os.path.isfile(db_path)
+ finally:
+ if os.path.isfile(db_path):
+ os.unlink(db_path)
+
# test special :pipeline: path to create in pipeline folder
c = resolve_configuration(
DuckDbClientConfiguration(credentials=":pipeline:")._bind_dataset_name(
dataset_name="test_dataset"
- )
+ ),
+ explicit_value={"bound_to_pipeline": p}, # not an active pipeline
)
db_path = os.path.abspath(os.path.join(p.working_dir, DEFAULT_DUCK_DB_NAME))
assert c.credentials._conn_str().lower() == db_path.lower()
@@ -135,15 +170,15 @@ def test_duckdb_database_path() -> None:
conn = c.credentials.borrow_conn(read_only=False)
c.credentials.return_conn(conn)
assert os.path.isfile(db_path)
- assert p.get_local_state_val("duckdb_database").lower() == db_path.lower()
p = p.drop()
# provide relative path
- db_path = "_storage/test_quack.duckdb"
+ db_path = "test_quack.duckdb"
c = resolve_configuration(
- DuckDbClientConfiguration(
- credentials="duckdb:///_storage/test_quack.duckdb"
- )._bind_dataset_name(dataset_name="test_dataset")
+ DuckDbClientConfiguration(credentials="duckdb:///test_quack.duckdb")._bind_dataset_name(
+ dataset_name="test_dataset"
+ ),
+ explicit_value={"bound_to_pipeline": p},
)
assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
conn = c.credentials.borrow_conn(read_only=False)
@@ -152,11 +187,12 @@ def test_duckdb_database_path() -> None:
p = p.drop()
# provide absolute path
- db_path = os.path.abspath("_storage/abs_test_quack.duckdb")
+ db_path = os.path.abspath("abs_test_quack.duckdb")
c = resolve_configuration(
DuckDbClientConfiguration(credentials=f"duckdb:///{db_path}")._bind_dataset_name(
dataset_name="test_dataset",
- )
+ ),
+ explicit_value={"bound_to_pipeline": p},
)
assert os.path.isabs(c.credentials.database)
assert c.credentials._conn_str().lower() == db_path.lower()
@@ -166,11 +202,12 @@ def test_duckdb_database_path() -> None:
p = p.drop()
# set just path as credentials
- db_path = "_storage/path_test_quack.duckdb"
+ db_path = "path_test_quack.duckdb"
c = resolve_configuration(
DuckDbClientConfiguration(credentials=db_path)._bind_dataset_name(
dataset_name="test_dataset"
- )
+ ),
+ explicit_value={"bound_to_pipeline": p},
)
assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
conn = c.credentials.borrow_conn(read_only=False)
@@ -178,11 +215,12 @@ def test_duckdb_database_path() -> None:
assert os.path.isfile(db_path)
p = p.drop()
- db_path = os.path.abspath("_storage/abs_path_test_quack.duckdb")
+ db_path = os.path.abspath("abs_path_test_quack.duckdb")
c = resolve_configuration(
DuckDbClientConfiguration(credentials=db_path)._bind_dataset_name(
dataset_name="test_dataset"
- )
+ ),
+ explicit_value={"bound_to_pipeline": p},
)
assert os.path.isabs(c.credentials.database)
assert c.credentials._conn_str().lower() == db_path.lower()
@@ -196,7 +234,7 @@ def test_duckdb_database_path() -> None:
with pytest.raises(duckdb.IOException):
c = resolve_configuration(
- DuckDbClientConfiguration(credentials=TEST_STORAGE_ROOT)._bind_dataset_name(
+ DuckDbClientConfiguration(credentials=".")._bind_dataset_name(
dataset_name="test_dataset"
)
)
@@ -204,38 +242,61 @@ def test_duckdb_database_path() -> None:
def test_keeps_initial_db_path() -> None:
- db_path = "_storage/path_test_quack.duckdb"
- p = dlt.pipeline(
- pipeline_name="quack_pipeline", destination=dlt.destinations.duckdb(credentials=db_path)
- )
+ db_path = "path_test_quack.duckdb"
+ # this must be present in the CREDENTIALS env var so attach also picks it up
+ os.environ["CREDENTIALS"] = db_path
+
+ p = dlt.pipeline(pipeline_name="quack_pipeline", destination=dlt.destinations.duckdb())
print(p.pipelines_dir)
+ assert p.state["_local"]["initial_cwd"] == os.path.abspath(os.path.curdir).lower()
with p.sql_client() as conn:
# still cwd
assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
- # but it is kept in the local state
- assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower()
# attach the pipeline
p = dlt.attach(pipeline_name="quack_pipeline")
- assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower()
+ assert p.state["_local"]["initial_cwd"] == os.path.abspath(os.path.curdir).lower()
with p.sql_client() as conn:
# still cwd
- assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower()
assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
# now create a new pipeline
dlt.pipeline(pipeline_name="not_quack", destination="dummy")
with p.sql_client() as conn:
# still cwd
- assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower()
- # new pipeline context took over
- # TODO: restore pipeline context on each call
- assert conn.credentials._conn_str().lower() != os.path.abspath(db_path).lower()
+ assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
+
+
+def test_uses_duckdb_local_path_compat() -> None:
+ db_path = "./path_test_quack.duckdb"
+ p = dlt.pipeline(pipeline_name="quack_pipeline")
+ # old db location is still recognized
+ p.set_local_state_val("duckdb_database", os.path.abspath(db_path))
+ p = dlt.attach("quack_pipeline", destination="duckdb")
+ with p.sql_client() as conn:
+ # still cwd
+ assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower()
+
+
+def test_drops_pipeline_changes_bound() -> None:
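+ # a duckdb file created in cwd survives pipeline.drop(); the internal :pipeline: database
+ # lives in the pipeline working dir and is wiped together with it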
+ p = dlt.pipeline(pipeline_name="quack_pipeline", destination="duckdb")
+ p.run([1, 2, 3], table_name="p_table")
+ p = p.drop()
+ assert len(p._dataset().p_table.fetchall()) == 3
+
+ # drops internal duckdb
+ p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(":pipeline:"))
+ p.run([1, 2, 3], table_name="p_table")
+ p = p.drop()
+ with pytest.raises(DatabaseUndefinedRelation):
+ p._dataset().p_table.fetchall()
def test_duckdb_database_delete() -> None:
- db_path = "_storage/path_test_quack.duckdb"
- p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(credentials=db_path))
+ db_path = "./path_test_quack.duckdb"
+ os.environ["CREDENTIALS"] = db_path
+
+ p = dlt.pipeline(pipeline_name="quack_pipeline", destination="duckdb")
p.run([1, 2, 3], table_name="table", dataset_name="dataset")
# attach the pipeline
p = dlt.attach(pipeline_name="quack_pipeline")
@@ -244,18 +305,19 @@ def test_duckdb_database_delete() -> None:
os.remove(db_path)
p = dlt.attach(pipeline_name="quack_pipeline")
assert p.first_run is False
+ assert not os.path.exists(db_path)
p.run([1, 2, 3], table_name="table", dataset_name="dataset")
- # we reverted to a default path in cwd
- with pytest.raises(KeyError):
- p.get_local_state_val("duckdb_database")
+ assert os.path.exists(db_path)
def test_duck_database_path_delete() -> None:
# delete path
- db_folder = "_storage/db_path"
+ db_folder = "./db_path"
os.makedirs(db_folder)
db_path = f"{db_folder}/path_test_quack.duckdb"
- p = dlt.pipeline(pipeline_name="deep_quack_pipeline", destination=duckdb(credentials=db_path))
+ os.environ["CREDENTIALS"] = db_path
+
+ p = dlt.pipeline(pipeline_name="deep_quack_pipeline", destination="duckdb")
p.run([1, 2, 3], table_name="table", dataset_name="dataset")
# attach the pipeline
p = dlt.attach(pipeline_name="deep_quack_pipeline")
@@ -265,10 +327,15 @@ def test_duck_database_path_delete() -> None:
os.rmdir(db_folder)
p = dlt.attach(pipeline_name="deep_quack_pipeline")
assert p.first_run is False
- p.run([1, 2, 3], table_name="table", dataset_name="dataset")
- # we reverted to a default path in cwd
- with pytest.raises(KeyError):
- p.get_local_state_val("duckdb_database")
+
+ # we won't be able to recreate the database because the folder was deleted
+ with pytest.raises(PipelineStepFailed) as py_ex:
+ p.run([1, 2, 3], table_name="table", dataset_name="dataset")
+ assert "No such file or directory" in str(py_ex.value)
+ # no database
+ assert not os.path.exists(db_path)
+ # restore folder, otherwise cleanup fails
+ os.makedirs(db_folder)
def test_case_sensitive_database_name() -> None:
diff --git a/tests/load/filesystem/test_filesystem_sql_secrets.py b/tests/load/filesystem/test_filesystem_sql_secrets.py
new file mode 100644
index 0000000000..4307ff7d12
--- /dev/null
+++ b/tests/load/filesystem/test_filesystem_sql_secrets.py
@@ -0,0 +1,112 @@
+from typing import Any
+
+import pytest
+import os
+from pytest_mock import MockerFixture
+
+from tests.utils import TEST_STORAGE_ROOT
+from tests.load.utils import (
+ destinations_configs,
+ DestinationTestConfiguration,
+ AWS_BUCKET,
+)
+from dlt.common.utils import uniq_id
+from dlt.common import logger
+
+
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(all_buckets_filesystem_configs=True, bucket_subset=(AWS_BUCKET,)),
+ ids=lambda x: x.name,
+)
+def test_secrets_management(
+ destination_config: DestinationTestConfiguration, mocker: MockerFixture
+) -> None:
+ """Test the handling of secrets by the sql_client, we only need to do this on s3
+ as the other destinations work accordingly"""
+
+ # we can use fake keys
+ os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = "secret_key"
+ os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"] = "key"
+
+ warning_message = "You are persisting duckdb secrets but are storing them in the default folder"
+
+ logger_spy = mocker.spy(logger, "warn")
+
+ pipeline = destination_config.setup_pipeline(
+ "read_pipeline",
+ dataset_name="read_test",
+ )
+
+ import duckdb
+ from duckdb import HTTPException
+ from dlt.destinations.impl.filesystem.sql_client import (
+ FilesystemSqlClient,
+ DuckDbCredentials,
+ )
+
+ duck_db_location = TEST_STORAGE_ROOT + "/" + uniq_id()
+ secrets_dir = f"{TEST_STORAGE_ROOT}/duck_secrets_{uniq_id()}"
+
+ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection:
+ external_db = duckdb.connect(duck_db_location)
+ external_db.sql(f"SET secret_directory = '{secrets_dir}';")
+ external_db.execute("CREATE SCHEMA IF NOT EXISTS first;")
+ return external_db
+
+ def _fs_sql_client_for_external_db(
+ connection: duckdb.DuckDBPyConnection,
+ ) -> FilesystemSqlClient:
+ return FilesystemSqlClient(
+ dataset_name="second",
+ fs_client=pipeline.destination_client(), # type: ignore
+ credentials=DuckDbCredentials(connection),
+ )
+
+ def _secrets_exist() -> bool:
+ return os.path.isdir(secrets_dir) and len(os.listdir(secrets_dir)) > 0
+
+ # first test what happens if there are no external secrets
+ external_db = _external_duckdb_connection()
+ fs_sql_client = _fs_sql_client_for_external_db(external_db)
+ with fs_sql_client as sql_client:
+ sql_client.create_views_for_tables({"items": "items"})
+ external_db.close()
+ assert not _secrets_exist()
+
+ # add secrets and check that they are there
+ external_db = _external_duckdb_connection()
+ fs_sql_client = _fs_sql_client_for_external_db(external_db)
+ with fs_sql_client as sql_client:
+ fs_sql_client.create_authentication(persistent=True)
+ assert _secrets_exist()
+
+ # remove secrets and check that they are removed
+ with fs_sql_client as sql_client:
+ fs_sql_client.drop_authentication()
+ assert not _secrets_exist()
+ external_db.close()
+
+ # prevent creating persistent secrets on in-memory databases
+ fs_sql_client = FilesystemSqlClient(
+ dataset_name="second",
+ fs_client=pipeline.destination_client(), # type: ignore
+ )
+ with pytest.raises(Exception):
+ with fs_sql_client as sql_client:
+ fs_sql_client.create_authentication(persistent=True)
+
+ # check that no warning was logged
+ logger_spy.assert_not_called()
+
+ # check that warning is logged when secrets are persisted in the default folder
+ duck_db_location = TEST_STORAGE_ROOT + "/" + uniq_id()
+ secrets_dir = f"{TEST_STORAGE_ROOT}/duck_secrets_{uniq_id()}"
+ duck_db = duckdb.connect(duck_db_location)
+ fs_sql_client = _fs_sql_client_for_external_db(duck_db)
+ with fs_sql_client as sql_client:
+ sql_client.create_authentication(persistent=True)
+ logger_spy.assert_called_once()
+ assert warning_message in logger_spy.call_args_list[0][0][0]
+ duck_db.close()
diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_object_store_rs_credentials.py
index c69521f6ea..f23187a269 100644
--- a/tests/load/filesystem/test_object_store_rs_credentials.py
+++ b/tests/load/filesystem/test_object_store_rs_credentials.py
@@ -2,6 +2,8 @@
from typing import Any, Dict
+import os
+import json # noqa: I251
import pytest
from deltalake import DeltaTable
from deltalake.exceptions import TableNotFoundError
@@ -14,9 +16,13 @@
AzureCredentialsWithoutDefaults,
AwsCredentials,
AwsCredentialsWithoutDefaults,
+ GcpCredentials,
GcpServiceAccountCredentialsWithoutDefaults,
GcpOAuthCredentialsWithoutDefaults,
)
+from dlt.common.utils import custom_environ
+from dlt.common.configuration.resolve import resolve_configuration
+from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials
from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException
from tests.load.utils import (
@@ -27,6 +33,9 @@
ALL_FILESYSTEM_DRIVERS,
)
+
+pytestmark = pytest.mark.essential
+
if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")):
pytest.skip(
"Requires at least one of `az`, `s3`, `gs`, `r2` in `ALL_FILESYSTEM_DRIVERS`.",
@@ -34,12 +43,14 @@
)
-FS_CREDS: Dict[str, Any] = dlt.secrets.get("destination.filesystem.credentials")
-if FS_CREDS is None:
- pytest.skip(
- msg="`destination.filesystem.credentials` must be configured for these tests.",
- allow_module_level=True,
- )
+@pytest.fixture
+def fs_creds() -> Dict[str, Any]:
+ creds: Dict[str, Any] = dlt.secrets.get("destination.filesystem.credentials")
+ if creds is None:
+ pytest.skip(
+ msg="`destination.filesystem.credentials` must be configured for these tests.",
+ )
+ return creds
def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool:
@@ -53,7 +64,7 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) ->
storage_options=object_store_rs_credentials,
)
except TableNotFoundError:
- # this error implies the connection was succesful
+ # this error implies the connection was successful
# there is no Delta table at `bucket_url`
return True
return False
@@ -62,7 +73,7 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) ->
@pytest.mark.parametrize(
"driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")]
)
-def test_azure_object_store_rs_credentials(driver: str) -> None:
+def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None:
creds: AnyAzureCredentials
creds = AzureServicePrincipalCredentialsWithoutDefaults(
@@ -72,8 +83,8 @@ def test_azure_object_store_rs_credentials(driver: str) -> None:
# without SAS token
creds = AzureCredentialsWithoutDefaults(
- azure_storage_account_name=FS_CREDS["azure_storage_account_name"],
- azure_storage_account_key=FS_CREDS["azure_storage_account_key"],
+ azure_storage_account_name=fs_creds["azure_storage_account_name"],
+ azure_storage_account_key=fs_creds["azure_storage_account_key"],
)
assert creds.azure_storage_sas_token is None
assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials())
@@ -87,10 +98,9 @@ def test_azure_object_store_rs_credentials(driver: str) -> None:
@pytest.mark.parametrize(
"driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")]
)
-def test_aws_object_store_rs_credentials(driver: str) -> None:
+def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None:
creds: AwsCredentialsWithoutDefaults
- fs_creds = FS_CREDS
if driver == "r2":
fs_creds = R2_BUCKET_CONFIG["credentials"] # type: ignore[assignment]
@@ -168,16 +178,36 @@ def test_aws_object_store_rs_credentials(driver: str) -> None:
@pytest.mark.parametrize(
"driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")]
)
-def test_gcp_object_store_rs_credentials(driver) -> None:
+def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None:
+ creds: GcpCredentials
+
+ # GcpServiceAccountCredentialsWithoutDefaults
creds = GcpServiceAccountCredentialsWithoutDefaults(
- project_id=FS_CREDS["project_id"],
- private_key=FS_CREDS["private_key"],
+ project_id=fs_creds["project_id"],
+ private_key=fs_creds["private_key"],
# private_key_id must be configured in order for data lake to work
- private_key_id=FS_CREDS["private_key_id"],
- client_email=FS_CREDS["client_email"],
+ private_key_id=fs_creds["private_key_id"],
+ client_email=fs_creds["client_email"],
)
assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials())
+ # GcpDefaultCredentials
+
+ # reset failed default credentials timeout so we resolve below
+ GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0
+
+ # write service account key to JSON file
+ service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"])
+ path = "_secrets/service.json"
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ with open(path, "w", encoding="utf-8") as f:
+ json.dump(service_json, f)
+
+ with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}):
+ creds = GcpDefaultCredentials()
+ resolve_configuration(creds)
+ can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials())
+
# GcpOAuthCredentialsWithoutDefaults is currently not supported
with pytest.raises(NotImplementedError):
GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials()
diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py
index a5344e14e1..ac2ada2551 100644
--- a/tests/load/filesystem/test_sql_client.py
+++ b/tests/load/filesystem/test_sql_client.py
@@ -6,6 +6,9 @@
import pytest
import dlt
import os
+import shutil
+import logging
+
from dlt import Pipeline
from dlt.common.utils import uniq_id
@@ -16,21 +19,31 @@
GCS_BUCKET,
SFTP_BUCKET,
MEMORY_BUCKET,
+ AWS_BUCKET,
)
from dlt.destinations import filesystem
from tests.utils import TEST_STORAGE_ROOT
from dlt.destinations.exceptions import DatabaseUndefinedRelation
+@pytest.fixture(scope="function", autouse=True)
+def secret_directory():
+ secrets_dir = f"{TEST_STORAGE_ROOT}/duck_secrets_{uniq_id()}"
+ yield secrets_dir
+ shutil.rmtree(secrets_dir, ignore_errors=True)
+
+
def _run_dataset_checks(
pipeline: Pipeline,
destination_config: DestinationTestConfiguration,
+ secret_directory: str,
table_format: Any = None,
alternate_access_pipeline: Pipeline = None,
) -> None:
total_records = 200
- TEST_SECRET_NAME = "TEST_SECRET" + uniq_id()
+ # duckdb stores secret names in lower case, which is why an upper-case name could not be deleted
+ TEST_SECRET_NAME = "test_secret_" + uniq_id()
# only some buckets have support for persistent secrets
needs_persistent_secrets = (
@@ -130,6 +143,7 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection:
external_db = duckdb.connect(duck_db_location)
# the line below solves problems with certificate path lookup on linux, see duckdb docs
external_db.sql("SET azure_transport_option_type = 'curl';")
+ external_db.sql(f"SET secret_directory = '{secret_directory}';")
return external_db
def _fs_sql_client_for_external_db(
@@ -141,7 +155,7 @@ def _fs_sql_client_for_external_db(
credentials=DuckDbCredentials(connection),
)
- # we create a duckdb with a table an see wether we can add more views from the fs client
+ # we create a duckdb with a table and see whether we can add more views from the fs client
external_db = _external_duckdb_connection()
external_db.execute("CREATE SCHEMA first;")
external_db.execute("CREATE SCHEMA second;")
@@ -159,7 +173,7 @@ def _fs_sql_client_for_external_db(
assert len(external_db.sql("SELECT * FROM first.items").fetchall()) == 3
external_db.close()
- # in case we are not connecting to a bucket, views should still be here after connection reopen
+ # in case we are not connecting to a bucket that needs secrets, views should still be here after connection reopen
if not needs_persistent_secrets and not unsupported_persistent_secrets:
external_db = _external_duckdb_connection()
assert (
@@ -178,39 +192,45 @@ def _fs_sql_client_for_external_db(
)
external_db.close()
- # gs does not support persistent secrest, so we can't do further checks
+ # gs does not support persistent secrets, so we can't do further checks
if unsupported_persistent_secrets:
return
# create secret
external_db = _external_duckdb_connection()
fs_sql_client = _fs_sql_client_for_external_db(external_db)
- with fs_sql_client as sql_client:
- fs_sql_client.create_authentication(persistent=True, secret_name=TEST_SECRET_NAME)
- external_db.close()
-
- # now this should work
- external_db = _external_duckdb_connection()
- assert len(external_db.sql("SELECT * FROM second.referenced_items").fetchall()) == total_records
-
- # NOTE: when running this on CI, there seem to be some kind of race conditions that prevent
- # secrets from being removed as it does not find the file... We'll need to investigate this.
- return
- # now drop the secrets again
- fs_sql_client = _fs_sql_client_for_external_db(external_db)
- with fs_sql_client as sql_client:
- fs_sql_client.drop_authentication(TEST_SECRET_NAME)
- external_db.close()
+ try:
+ with fs_sql_client as sql_client:
+ fs_sql_client.create_authentication(persistent=True, secret_name=TEST_SECRET_NAME)
+ external_db.close()
- # fails again
- external_db = _external_duckdb_connection()
- with pytest.raises((HTTPException, IOException, InvalidInputException)):
+ # now this should work
+ external_db = _external_duckdb_connection()
assert (
len(external_db.sql("SELECT * FROM second.referenced_items").fetchall())
== total_records
)
- external_db.close()
+
+ # now drop the secrets again
+ fs_sql_client = _fs_sql_client_for_external_db(external_db)
+ with fs_sql_client as sql_client:
+ fs_sql_client.drop_authentication(TEST_SECRET_NAME)
+ external_db.close()
+
+ # fails again
+ external_db = _external_duckdb_connection()
+ with pytest.raises((HTTPException, IOException, InvalidInputException)):
+ assert (
+ len(external_db.sql("SELECT * FROM second.referenced_items").fetchall())
+ == total_records
+ )
+ external_db.close()
+ finally:
+ with duckdb.connect() as conn:
+ fs_sql_client = _fs_sql_client_for_external_db(conn)
+ with fs_sql_client as sql_client:
+ fs_sql_client.drop_authentication(TEST_SECRET_NAME)
@pytest.mark.essential
@@ -223,14 +243,21 @@ def _fs_sql_client_for_external_db(
), # TODO: make SFTP work
ids=lambda x: x.name,
)
-def test_read_interfaces_filesystem(destination_config: DestinationTestConfiguration) -> None:
+@pytest.mark.parametrize("disable_compression", [True, False])
+def test_read_interfaces_filesystem(
+ destination_config: DestinationTestConfiguration,
+ disable_compression: bool,
+ secret_directory: str,
+) -> None:
# we force multiple files per table, they may only hold 700 items
os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700"
-
+ destination_config.disable_compression = disable_compression
if destination_config.file_format not in ["parquet", "jsonl"]:
pytest.skip(
f"Test only works for jsonl and parquet, given: {destination_config.file_format}"
)
+ if destination_config.file_format in ["parquet"] and disable_compression:
+ pytest.skip("Disabling compression for parquet has no effect, skipping test")
pipeline = destination_config.setup_pipeline(
"read_pipeline",
@@ -238,7 +265,7 @@ def test_read_interfaces_filesystem(destination_config: DestinationTestConfigura
dev_mode=True,
)
- _run_dataset_checks(pipeline, destination_config)
+ _run_dataset_checks(pipeline, destination_config, secret_directory=secret_directory)
# for gcs buckets we additionally test the s3 compat layer
if destination_config.bucket_url == GCS_BUCKET:
@@ -248,7 +275,7 @@ def test_read_interfaces_filesystem(destination_config: DestinationTestConfigura
pipeline = destination_config.setup_pipeline(
"read_pipeline", dataset_name="read_test", dev_mode=True, destination=gcp_bucket
)
- _run_dataset_checks(pipeline, destination_config)
+ _run_dataset_checks(pipeline, destination_config, secret_directory=secret_directory)
@pytest.mark.essential
@@ -262,7 +289,9 @@ def test_read_interfaces_filesystem(destination_config: DestinationTestConfigura
),
ids=lambda x: x.name,
)
-def test_delta_tables(destination_config: DestinationTestConfiguration) -> None:
+def test_delta_tables(
+ destination_config: DestinationTestConfiguration, secret_directory: str
+) -> None:
os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700"
pipeline = destination_config.setup_pipeline(
@@ -285,6 +314,7 @@ def test_delta_tables(destination_config: DestinationTestConfiguration) -> None:
_run_dataset_checks(
pipeline,
destination_config,
+ secret_directory=secret_directory,
table_format="delta",
alternate_access_pipeline=access_pipeline,
)
@@ -296,9 +326,12 @@ def test_delta_tables(destination_config: DestinationTestConfiguration) -> None:
destinations_configs(local_filesystem_configs=True),
ids=lambda x: x.name,
)
-def test_evolving_filesystem(destination_config: DestinationTestConfiguration) -> None:
+@pytest.mark.parametrize("disable_compression", [True, False])
+def test_evolving_filesystem(
+ destination_config: DestinationTestConfiguration, disable_compression: bool
+) -> None:
"""test that files with unequal schemas still work together"""
-
+ destination_config.disable_compression = disable_compression
if destination_config.file_format not in ["parquet", "jsonl"]:
pytest.skip(
f"Test only works for jsonl and parquet, given: {destination_config.file_format}"
diff --git a/tests/load/lancedb/test_merge.py b/tests/load/lancedb/test_merge.py
new file mode 100644
index 0000000000..f04c846df7
--- /dev/null
+++ b/tests/load/lancedb/test_merge.py
@@ -0,0 +1,425 @@
+from typing import Iterator, List, Generator, Any
+
+import numpy as np
+import pandas as pd
+import pytest
+from lancedb.table import Table # type: ignore
+from pandas import DataFrame
+from pandas.testing import assert_frame_equal
+
+import dlt
+from dlt.common.typing import DictStrAny, DictStrStr
+from dlt.common.utils import uniq_id
+from dlt.destinations.impl.lancedb.lancedb_adapter import (
+ lancedb_adapter,
+)
+from tests.load.lancedb.utils import chunk_document
+from tests.load.utils import (
+ drop_active_pipeline_data,
+ sequence_generator,
+)
+from tests.pipeline.utils import (
+ assert_load_info,
+)
+
+
+# Mark all tests as essential, don't remove.
+pytestmark = pytest.mark.essential
+
+
+@pytest.fixture(autouse=True)
+def drop_lancedb_data() -> Iterator[None]:
+ yield
+ drop_active_pipeline_data()
+
+
+def test_lancedb_remove_nested_orphaned_records() -> None:
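+ # upsert merge on the parent primary key must also remove orphaned rows from the nested
+ # child and grandchild tables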
+ pipeline = dlt.pipeline(
+ pipeline_name="test_lancedb_remove_orphaned_records",
+ destination="lancedb",
+ dataset_name=f"test_lancedb_remove_orphaned_records_{uniq_id()}",
+ dev_mode=True,
+ )
+
+ @dlt.resource(
+ table_name="parent",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key="id",
+ merge_key="id",
+ )
+ def identity_resource(
+ data: List[DictStrAny],
+ ) -> Generator[List[DictStrAny], None, None]:
+ yield data
+
+ run_1 = [
+ {
+ "id": 1,
+ "child": [
+ {"bar": 1, "grandchild": [{"baz": 1}, {"baz": 2}]},
+ {"bar": 2, "grandchild": [{"baz": 3}]},
+ ],
+ },
+ {"id": 2, "child": [{"bar": 3, "grandchild": [{"baz": 4}]}]},
+ {
+ "id": 3,
+ "child": [
+ {"bar": 10, "grandchild": [{"baz": 5}]},
+ {"bar": 11, "grandchild": [{"baz": 6}, {"baz": 7}]},
+ ],
+ },
+ ]
+ info = pipeline.run(identity_resource(run_1))
+ assert_load_info(info)
+
+ run_2 = [
+ {
+ "id": 1,
+ "child": [{"bar": 1, "grandchild": [{"baz": 1}]}],
+ }, # Removes bar_2, baz_2 and baz_3.
+ {
+ "id": 2,
+ "child": [{"bar": 4, "grandchild": [{"baz": 8}]}],
+ }, # Removes bar_3, baz_4.
+ ]
+ info = pipeline.run(identity_resource(run_2))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ expected_parent_data = pd.DataFrame(
+ data=[
+ {"id": 1},
+ {"id": 2},
+ {"id": 3},
+ ]
+ )
+
+ expected_child_data = pd.DataFrame(
+ data=[
+ {"bar": 1},
+ {"bar": 4},
+ {"bar": 10},
+ {"bar": 11},
+ ]
+ )
+
+ expected_grandchild_data = pd.DataFrame(
+ data=[
+ {"baz": 1},
+ {"baz": 8},
+ {"baz": 5},
+ {"baz": 6},
+ {"baz": 7},
+ ]
+ )
+
+ parent_table_name = client.make_qualified_table_name("parent") # type: ignore[attr-defined]
+ child_table_name = client.make_qualified_table_name("parent__child") # type: ignore[attr-defined]
+ grandchild_table_name = client.make_qualified_table_name( # type: ignore[attr-defined]
+ "parent__child__grandchild"
+ )
+
+ parent_tbl = client.db_client.open_table(parent_table_name) # type: ignore[attr-defined]
+ child_tbl = client.db_client.open_table(child_table_name) # type: ignore[attr-defined]
+ grandchild_tbl = client.db_client.open_table(grandchild_table_name) # type: ignore[attr-defined]
+
+ actual_parent_df = parent_tbl.to_pandas().sort_values(by="id").reset_index(drop=True)
+ actual_child_df = child_tbl.to_pandas().sort_values(by="bar").reset_index(drop=True)
+ actual_grandchild_df = (
+ grandchild_tbl.to_pandas().sort_values(by="baz").reset_index(drop=True)
+ )
+
+ expected_parent_data = expected_parent_data.sort_values(by="id").reset_index(drop=True)
+ expected_child_data = expected_child_data.sort_values(by="bar").reset_index(drop=True)
+ expected_grandchild_data = expected_grandchild_data.sort_values(by="baz").reset_index(
+ drop=True
+ )
+
+ assert_frame_equal(actual_parent_df[["id"]], expected_parent_data)
+ assert_frame_equal(actual_child_df[["bar"]], expected_child_data)
+ assert_frame_equal(actual_grandchild_df[["baz"]], expected_grandchild_data)
+
+
+def test_lancedb_remove_orphaned_records_root_table() -> None:
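+ # merge_key=doc_id: chunks that are missing from a document's latest run are removed as orphans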
+ pipeline = dlt.pipeline(
+ pipeline_name="test_lancedb_remove_orphaned_records_root_table",
+ destination="lancedb",
+ dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}",
+ dev_mode=True,
+ )
+
+ @dlt.resource(
+ table_name="root",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key=["doc_id", "chunk_hash"],
+ merge_key=["doc_id"],
+ )
+ def identity_resource(
+ data: List[DictStrAny],
+ ) -> Generator[List[DictStrAny], None, None]:
+ yield data
+
+ lancedb_adapter(identity_resource)
+
+ run_1 = [
+ {"doc_id": 1, "chunk_hash": "1a"},
+ {"doc_id": 2, "chunk_hash": "2a"},
+ {"doc_id": 2, "chunk_hash": "2b"},
+ {"doc_id": 2, "chunk_hash": "2c"},
+ {"doc_id": 3, "chunk_hash": "3a"},
+ {"doc_id": 3, "chunk_hash": "3b"},
+ ]
+ info = pipeline.run(identity_resource(run_1))
+ assert_load_info(info)
+
+ run_2 = [
+ {"doc_id": 2, "chunk_hash": "2d"},
+ {"doc_id": 2, "chunk_hash": "2e"},
+ {"doc_id": 3, "chunk_hash": "3b"},
+ ]
+ info = pipeline.run(identity_resource(run_2))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ expected_root_table_df = (
+ pd.DataFrame(
+ data=[
+ {"doc_id": 1, "chunk_hash": "1a"},
+ {"doc_id": 2, "chunk_hash": "2d"},
+ {"doc_id": 2, "chunk_hash": "2e"},
+ {"doc_id": 3, "chunk_hash": "3b"},
+ ]
+ )
+ .sort_values(by=["doc_id", "chunk_hash"])
+ .reset_index(drop=True)
+ )
+
+ root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined]
+ tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined]
+
+ actual_root_df: DataFrame = (
+ tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash"]).reset_index(drop=True)
+ )[["doc_id", "chunk_hash"]]
+
+ assert_frame_equal(actual_root_df, expected_root_table_df)
+
+
+def test_lancedb_remove_orphaned_records_root_table_string_doc_id() -> None:
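+ # same scenario as above, but with string doc ids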
+ pipeline = dlt.pipeline(
+ pipeline_name="test_lancedb_remove_orphaned_records_root_table",
+ destination="lancedb",
+ dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}",
+ dev_mode=True,
+ )
+
+ @dlt.resource(
+ table_name="root",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key=["doc_id", "chunk_hash"],
+ merge_key=["doc_id"],
+ )
+ def identity_resource(
+ data: List[DictStrAny],
+ ) -> Generator[List[DictStrAny], None, None]:
+ yield data
+
+ lancedb_adapter(identity_resource)
+
+ run_1 = [
+ {"doc_id": "A", "chunk_hash": "1a"},
+ {"doc_id": "B", "chunk_hash": "2a"},
+ {"doc_id": "B", "chunk_hash": "2b"},
+ {"doc_id": "B", "chunk_hash": "2c"},
+ {"doc_id": "C", "chunk_hash": "3a"},
+ {"doc_id": "C", "chunk_hash": "3b"},
+ ]
+ info = pipeline.run(identity_resource(run_1))
+ assert_load_info(info)
+
+ run_2 = [
+ {"doc_id": "B", "chunk_hash": "2d"},
+ {"doc_id": "B", "chunk_hash": "2e"},
+ {"doc_id": "C", "chunk_hash": "3b"},
+ ]
+ info = pipeline.run(identity_resource(run_2))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ expected_root_table_df = (
+ pd.DataFrame(
+ data=[
+ {"doc_id": "A", "chunk_hash": "1a"},
+ {"doc_id": "B", "chunk_hash": "2d"},
+ {"doc_id": "B", "chunk_hash": "2e"},
+ {"doc_id": "C", "chunk_hash": "3b"},
+ ]
+ )
+ .sort_values(by=["doc_id", "chunk_hash"])
+ .reset_index(drop=True)
+ )
+
+ root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined]
+ tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined]
+
+ actual_root_df: DataFrame = (
+ tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash"]).reset_index(drop=True)
+ )[["doc_id", "chunk_hash"]]
+
+ assert_frame_equal(actual_root_df, expected_root_table_df)
+
+
+def test_lancedb_root_table_remove_orphaned_records_with_real_embeddings() -> None:
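+ # documents are chunked and embedded via the adapter; chunks orphaned by the updated run
+ # must be discarded along with their embeddings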
+ @dlt.resource(
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ table_name="document",
+ primary_key=["doc_id", "chunk"],
+ merge_key="doc_id",
+ )
+ def documents(docs: List[DictStrAny]) -> Generator[DictStrAny, None, None]:
+ for doc in docs:
+ doc_id = doc["doc_id"]
+ for chunk in chunk_document(doc["text"]):
+ yield {"doc_id": doc_id, "doc_text": doc["text"], "chunk": chunk}
+
+ @dlt.source()
+ def documents_source(
+ docs: List[DictStrAny],
+ ) -> Any:
+ return documents(docs)
+
+ lancedb_adapter(
+ documents,
+ embed=["chunk"],
+ )
+
+ pipeline = dlt.pipeline(
+ pipeline_name="test_lancedb_remove_orphaned_records_with_embeddings",
+ destination="lancedb",
+ dataset_name=f"test_lancedb_remove_orphaned_records_{uniq_id()}",
+ dev_mode=True,
+ )
+
+ initial_docs = [
+ {
+ "text": (
+ "This is the first document. It contains some text that will be chunked and"
+ " embedded. (I don't want to be seen in updated run's embedding chunk texts btw)"
+ ),
+ "doc_id": 1,
+ },
+ {
+ "text": "Here's another document. It's a bit different from the first one.",
+ "doc_id": 2,
+ },
+ ]
+
+ info = pipeline.run(documents_source(initial_docs))
+ assert_load_info(info)
+
+ updated_docs = [
+ {
+ "text": "This is the first document, but it has been updated with new content.",
+ "doc_id": 1,
+ },
+ {
+ "text": "This is a completely new document that wasn't in the initial set.",
+ "doc_id": 3,
+ },
+ ]
+
+ info = pipeline.run(documents_source(updated_docs))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ embeddings_table_name = client.make_qualified_table_name("document") # type: ignore[attr-defined]
+ tbl: Table = client.db_client.open_table(embeddings_table_name) # type: ignore[attr-defined]
+ df = tbl.to_pandas()
+
+ # Check that (non-empty) embeddings are present and that orphaned embeddings have been discarded.
+ assert len(df) == 21
+ assert "vector" in df.columns
+ for _, vector in enumerate(df["vector"]):
+ assert isinstance(vector, np.ndarray)
+ assert vector.size > 0
+
+
+def test_lancedb_compound_merge_key_root_table() -> None:
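+ # with no_remove_orphans=True, rows from run 1 that are not matched by run 2 are kept
+ # and matched rows are updated in place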
+ pipeline = dlt.pipeline(
+ pipeline_name="test_lancedb_compound_merge_key",
+ destination="lancedb",
+ dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}",
+ dev_mode=True,
+ )
+
+ @dlt.resource(
+ table_name="root",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ primary_key=["doc_id", "chunk_hash"],
+ merge_key=["doc_id", "chunk_hash"],
+ )
+ def identity_resource(
+ data: List[DictStrAny],
+ ) -> Generator[List[DictStrAny], None, None]:
+ yield data
+
+ lancedb_adapter(identity_resource, no_remove_orphans=True)
+
+ run_1 = [
+ {"doc_id": 1, "chunk_hash": "a", "foo": "bar"},
+ {"doc_id": 1, "chunk_hash": "b", "foo": "coo"},
+ ]
+ info = pipeline.run(identity_resource(run_1))
+ assert_load_info(info)
+
+ run_2 = [
+ {"doc_id": 1, "chunk_hash": "a", "foo": "aat"},
+ {"doc_id": 1, "chunk_hash": "c", "foo": "loot"},
+ ]
+ info = pipeline.run(identity_resource(run_2))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ expected_root_table_df = (
+ pd.DataFrame(
+ data=[
+ {"doc_id": 1, "chunk_hash": "a", "foo": "aat"},
+ {"doc_id": 1, "chunk_hash": "b", "foo": "coo"},
+ {"doc_id": 1, "chunk_hash": "c", "foo": "loot"},
+ ]
+ )
+ .sort_values(by=["doc_id", "chunk_hash", "foo"])
+ .reset_index(drop=True)
+ )
+
+ root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined]
+ tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined]
+
+ actual_root_df: DataFrame = (
+ tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash", "foo"]).reset_index(drop=True)
+ )[["doc_id", "chunk_hash", "foo"]]
+
+ assert_frame_equal(actual_root_df, expected_root_table_df)
+
+
+def test_must_provide_at_least_primary_key_on_merge_disposition() -> None:
+ """We need upsert merge's deterministic _dlt_id to perform orphan removal.
+ Hence, at least the primary key is required (an exception is raised if it is missing).
+ Specify a merge key for custom orphan identification."""
+ generator_instance1 = sequence_generator()
+
+ @dlt.resource(write_disposition={"disposition": "merge", "strategy": "upsert"})
+ def some_data() -> Generator[DictStrStr, Any, None]:
+ yield from next(generator_instance1)
+
+ pipeline = dlt.pipeline(
+ pipeline_name="test_must_provide_both_primary_and_merge_key_on_merge_disposition",
+ destination="lancedb",
+ dataset_name=(
+ f"test_must_provide_both_primary_and_merge_key_on_merge_disposition{uniq_id()}"
+ ),
+ )
+ with pytest.raises(Exception):
+ load_info = pipeline.run(
+ some_data(),
+ )
+ assert_load_info(load_info)
diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py
index 6cd0abd587..345934fb29 100644
--- a/tests/load/lancedb/test_pipeline.py
+++ b/tests/load/lancedb/test_pipeline.py
@@ -1,25 +1,30 @@
import multiprocessing
-from typing import Iterator, Generator, Any, List, Mapping
+import os
+from typing import Iterator, Generator, Any, List
+from typing import Mapping
+from typing import Union, Dict
import pytest
-import lancedb # type: ignore
-from lancedb import DBConnection
+from lancedb import DBConnection # type: ignore
from lancedb.embeddings import EmbeddingFunctionRegistry # type: ignore
+from lancedb.table import Table # type: ignore
import dlt
from dlt.common import json
-from dlt.common.typing import DictStrStr, DictStrAny
-from dlt.common.utils import uniq_id
+from dlt.common.typing import DictStrAny
+from dlt.common.typing import DictStrStr
+from dlt.common.utils import uniq_id, digest128
from dlt.destinations.impl.lancedb.lancedb_adapter import (
lancedb_adapter,
VECTORIZE_HINT,
)
from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient
-from tests.load.lancedb.utils import assert_table
+from dlt.extract import DltResource
+from tests.load.lancedb.utils import assert_table, chunk_document, mock_embed
from tests.load.utils import sequence_generator, drop_active_pipeline_data
from tests.pipeline.utils import assert_load_info
-# Mark all tests as essential, do not remove.
+# Mark all tests as essential, don't remove.
pytestmark = pytest.mark.essential
@@ -49,6 +54,22 @@ def some_data() -> Generator[DictStrStr, Any, None]:
"x-lancedb-embed": True,
}
+ lancedb_adapter(
+ some_data,
+ merge_key="content",
+ )
+
+ # via merge_key
+ assert some_data._hints["merge_key"] == "content"
+
+ assert some_data.columns["content"] == { # type: ignore
+ "name": "content",
+ "data_type": "text",
+ "x-lancedb-embed": True,
+ }
+
+ assert some_data.compute_table_schema()["columns"]["content"]["merge_key"] is True
+
def test_basic_state_and_schema() -> None:
generator_instance1 = sequence_generator()
@@ -118,14 +139,13 @@ def some_data() -> Generator[DictStrStr, Any, None]:
def test_explicit_append() -> None:
- """Append should work even when the primary key is specified."""
data = [
{"doc_id": 1, "content": "1"},
{"doc_id": 2, "content": "2"},
{"doc_id": 3, "content": "3"},
]
- @dlt.resource(primary_key="doc_id")
+ @dlt.resource()
def some_data() -> Generator[List[DictStrAny], Any, None]:
yield data
@@ -142,6 +162,7 @@ def some_data() -> Generator[List[DictStrAny], Any, None]:
info = pipeline.run(
some_data(),
)
+ assert_load_info(info)
assert_table(pipeline, "some_data", items=data)
@@ -156,25 +177,22 @@ def some_data() -> Generator[List[DictStrAny], Any, None]:
def test_pipeline_replace() -> None:
- generator_instance1 = sequence_generator()
- generator_instance2 = sequence_generator()
+ os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "2"
+ os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "2"
+
+ generator_instance1, generator_instance2 = (sequence_generator(), sequence_generator())
@dlt.resource
def some_data() -> Generator[DictStrStr, Any, None]:
yield from next(generator_instance1)
- lancedb_adapter(
- some_data,
- embed=["content"],
- )
-
uid = uniq_id()
pipeline = dlt.pipeline(
pipeline_name="test_pipeline_replace",
destination="lancedb",
dataset_name="test_pipeline_replace_dataset"
- + uid, # lancedb doesn't mandate any name normalization
+ + uid, # Lancedb doesn't mandate any name normalization.
)
info = pipeline.run(
@@ -263,23 +281,11 @@ def test_pipeline_merge() -> None:
},
]
- @dlt.resource(primary_key="doc_id")
+ @dlt.resource(primary_key=["doc_id"])
def movies_data() -> Any:
yield data
- @dlt.resource(primary_key="doc_id", merge_key=["merge_id", "title"])
- def movies_data_explicit_merge_keys() -> Any:
- yield data
-
- lancedb_adapter(
- movies_data,
- embed=["description"],
- )
-
- lancedb_adapter(
- movies_data_explicit_merge_keys,
- embed=["description"],
- )
+ lancedb_adapter(movies_data, embed=["description"], no_remove_orphans=True)
pipeline = dlt.pipeline(
pipeline_name="movies",
@@ -288,7 +294,7 @@ def movies_data_explicit_merge_keys() -> Any:
)
info = pipeline.run(
movies_data(),
- write_disposition="merge",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
dataset_name=f"MoviesDataset{uniq_id()}",
)
assert_load_info(info)
@@ -299,26 +305,11 @@ def movies_data_explicit_merge_keys() -> Any:
info = pipeline.run(
movies_data(),
- write_disposition="merge",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
)
assert_load_info(info)
assert_table(pipeline, "movies_data", items=data)
- info = pipeline.run(
- movies_data(),
- write_disposition="merge",
- )
- assert_load_info(info)
- assert_table(pipeline, "movies_data", items=data)
-
- # Test with explicit merge keys.
- info = pipeline.run(
- movies_data_explicit_merge_keys(),
- write_disposition="merge",
- )
- assert_load_info(info)
- assert_table(pipeline, "movies_data_explicit_merge_keys", items=data)
-
def test_pipeline_with_schema_evolution() -> None:
data = [
@@ -388,9 +379,9 @@ def test_merge_github_nested() -> None:
data = json.load(f)
info = pipe.run(
- lancedb_adapter(data[:17], embed=["title", "body"]),
+ lancedb_adapter(data[:17], embed=["title", "body"], no_remove_orphans=True),
table_name="issues",
- write_disposition="merge",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
primary_key="id",
)
assert_load_info(info)
@@ -426,18 +417,116 @@ def test_merge_github_nested() -> None:
def test_empty_dataset_allowed() -> None:
# dataset_name is optional so dataset name won't be autogenerated when not explicitly passed.
pipe = dlt.pipeline(destination="lancedb", dev_mode=True)
- client: LanceDBClient = pipe.destination_client() # type: ignore[assignment]
assert pipe.dataset_name is None
info = pipe.run(lancedb_adapter(["context", "created", "not a stop word"], embed=["value"]))
# Dataset in load info is empty.
assert info.dataset_name is None
- client = pipe.destination_client() # type: ignore[assignment]
- assert client.dataset_name is None
- assert client.sentinel_table == "dltSentinelTable"
+ client = pipe.destination_client()
+ assert client.dataset_name is None # type: ignore
+ assert client.sentinel_table == "dltSentinelTable" # type: ignore
assert_table(pipe, "content", expected_items_count=3)
+def test_lancedb_remove_nested_orphaned_records_with_chunks() -> None:
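+ # each document yields a nested embeddings table; updating a document must drop its old chunk rows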
+ @dlt.resource(
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ table_name="document",
+ primary_key=["doc_id"],
+ merge_key=["doc_id"],
+ )
+ def documents(docs: List[DictStrAny]) -> Generator[DictStrAny, None, None]:
+ for doc in docs:
+ doc_id = doc["doc_id"]
+ chunks = chunk_document(doc["text"])
+ embeddings = [
+ {
+ "chunk_hash": digest128(chunk),
+ "chunk_text": chunk,
+ "embedding": mock_embed(),
+ }
+ for chunk in chunks
+ ]
+ yield {"doc_id": doc_id, "doc_text": doc["text"], "embeddings": embeddings}
+
+ @dlt.source(max_table_nesting=1)
+ def documents_source(
+ docs: List[DictStrAny],
+ ) -> Union[Generator[Dict[str, Any], None, None], DltResource]:
+ return documents(docs)
+
+ pipeline = dlt.pipeline(
+ pipeline_name="chunked_docs",
+ destination="lancedb",
+ dataset_name="chunked_documents",
+ dev_mode=True,
+ )
+
+ initial_docs = [
+ {
+ "text": (
+ "This is the first document. It contains some text that will be chunked and"
+ " embedded. (I don't want to be seen in updated run's embedding chunk texts btw)"
+ ),
+ "doc_id": 1,
+ },
+ {
+ "text": "Here's another document. It's a bit different from the first one.",
+ "doc_id": 2,
+ },
+ ]
+
+ info = pipeline.run(documents_source(initial_docs))
+ assert_load_info(info)
+
+ updated_docs = [
+ {
+ "text": "This is the first document, but it has been updated with new content.",
+ "doc_id": 1,
+ },
+ {
+ "text": "This is a completely new document that wasn't in the initial set.",
+ "doc_id": 3,
+ },
+ ]
+
+ info = pipeline.run(documents_source(updated_docs))
+ assert_load_info(info)
+
+ with pipeline.destination_client() as client:
+ # Orphaned chunks/documents must have been discarded.
+ # Shouldn't contain any text from `initial_docs` where doc_id=1.
+ expected_text = {
+ "Here's ano",
+ "ther docum",
+ "ent. It's ",
+ "a bit diff",
+ "erent from",
+ " the first",
+ " one.",
+ "This is th",
+ "e first do",
+ "cument, bu",
+ "t it has b",
+ "een update",
+ "d with new",
+ " content.",
+ "This is a ",
+ "completely",
+ " new docum",
+ "ent that w",
+ "asn't in t",
+ "he initial",
+ " set.",
+ }
+
+ embeddings_table_name = client.make_qualified_table_name("document__embeddings") # type: ignore[attr-defined]
+
+ tbl: Table = client.db_client.open_table(embeddings_table_name) # type: ignore[attr-defined]
+ df = tbl.to_pandas()
+ assert set(df["chunk_text"]) == expected_text
+
+
search_data = [
{"text": "Frodo was a happy puppy"},
{"text": "There are several kittens playing"},
diff --git a/tests/load/lancedb/test_utils.py b/tests/load/lancedb/test_utils.py
new file mode 100644
index 0000000000..d7f9729f26
--- /dev/null
+++ b/tests/load/lancedb/test_utils.py
@@ -0,0 +1,46 @@
+import pyarrow as pa
+import pytest
+
+from dlt.destinations.impl.lancedb.utils import (
+ create_filter_condition,
+ fill_empty_source_column_values_with_placeholder,
+)
+
+
+# Mark all tests as essential, don't remove.
+pytestmark = pytest.mark.essential
+
+
+def test_fill_empty_source_column_values_with_placeholder() -> None:
+ data = [
+ pa.array(["", "hello", ""]),
+ pa.array(["hello", None, ""]),
+ pa.array([1, 2, 3]),
+ pa.array(["world", "", "arrow"]),
+ ]
+ table = pa.Table.from_arrays(data, names=["A", "B", "C", "D"])
+
+ source_columns = ["A", "B"]
+ placeholder = "placeholder"
+
+ new_table = fill_empty_source_column_values_with_placeholder(table, source_columns, placeholder)
+
+ expected_data = [
+ pa.array(["placeholder", "hello", "placeholder"]),
+ pa.array(["hello", "placeholder", "placeholder"]),
+ pa.array([1, 2, 3]),
+ pa.array(["world", "", "arrow"]),
+ ]
+ expected_table = pa.Table.from_arrays(expected_data, names=["A", "B", "C", "D"])
+ assert new_table.equals(expected_table)
+
+
+def test_create_filter_condition() -> None:
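+ # string values are quoted and escaped, numeric values are rendered as floats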
+ assert (
+ create_filter_condition("_dlt_load_id", pa.array(["A", "B", "C'c\n"]))
+ == "_dlt_load_id IN ('A', 'B', 'C''c\\n')"
+ )
+ assert (
+ create_filter_condition("_dlt_load_id", pa.array([1.2, 3, 5 / 2]))
+ == "_dlt_load_id IN (1.2, 3.0, 2.5)"
+ )
diff --git a/tests/load/lancedb/utils.py b/tests/load/lancedb/utils.py
index 7431e895b7..30430fe076 100644
--- a/tests/load/lancedb/utils.py
+++ b/tests/load/lancedb/utils.py
@@ -40,7 +40,7 @@ def assert_table(
exists = client.table_exists(qualified_table_name)
assert exists
- records = client.db_client.open_table(qualified_table_name).search().limit(50).to_list()
+ records = client.db_client.open_table(qualified_table_name).search().limit(0).to_list()
if expected_items_count is not None:
assert expected_items_count == len(records)
@@ -51,7 +51,6 @@ def assert_table(
drop_keys = [
"_dlt_id",
"_dlt_load_id",
- dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__",
dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector",
]
objects_without_dlt_or_special_keys = [
@@ -72,3 +71,13 @@ def generate_embeddings(
def ndims(self) -> int:
return 2
+
+
+def mock_embed(
+ dim: int = 10,
+) -> str:
+ return str(np.random.random_sample(dim))
+
+
+def chunk_document(doc: str, chunk_size: int = 10) -> List[str]:
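+ # split a document into fixed-size character chunks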
+ return [doc[i : i + chunk_size] for i in range(0, len(doc), chunk_size)]
diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py
index 9d2a4abf49..cb65c6bcf1 100644
--- a/tests/load/pipeline/test_bigquery.py
+++ b/tests/load/pipeline/test_bigquery.py
@@ -360,3 +360,40 @@ def partition_date() -> Iterator[Dict[str, Any]]:
table: Table = nc.get_table(table_fqtn) # type: ignore[no-redef]
assert table.time_partitioning.field == "my_date_column"
assert table.time_partitioning.type_ == "DAY"
+
+
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(default_sql_configs=True, subset=["bigquery"]),
+ ids=lambda x: x.name,
+)
+def test_adapter_autodetect_schema_with_merge(
+ destination_config: DestinationTestConfiguration,
+) -> None:
+ """simple test that merging works with autodetect schema"""
+ pipeline = destination_config.setup_pipeline(
+ "bigquery_autodetect_schema_with_merge",
+ dev_mode=True,
+ )
+
+ @dlt.resource(primary_key="id", table_name="items", write_disposition="merge")
+ def resource():
+ for _id in range(0, 5):
+ yield {"id": _id, "value": _id, "nested": [{"id": _id, "value": _id}]}
+
+ bigquery_adapter(resource, autodetect_schema=True)
+ pipeline.run(resource)
+
+ assert len(pipeline._dataset().items.df()) == 5
+ assert len(pipeline._dataset().items__nested.df()) == 5
+
+ @dlt.resource(primary_key="id", table_name="items", write_disposition="merge")
+ def resource2():
+ for _id in range(2, 7):
+ yield {"id": _id, "value": _id, "nested": [{"id": _id, "value": _id}]}
+
+ bigquery_adapter(resource2, autodetect_schema=True)
+ pipeline.run(resource2)
+
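+    # ids 0-4 from the first run and 2-6 from the second overlap on 2-4, so the merge yields 7 unique rows in both tables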
+ assert len(pipeline._dataset().items.df()) == 7
+ assert len(pipeline._dataset().items__nested.df()) == 7
diff --git a/tests/load/pipeline/test_clickhouse.py b/tests/load/pipeline/test_clickhouse.py
index 9e9c156144..7d7a821445 100644
--- a/tests/load/pipeline/test_clickhouse.py
+++ b/tests/load/pipeline/test_clickhouse.py
@@ -1,12 +1,19 @@
-from typing import Iterator
+from typing import Any, Iterator
import pytest
import dlt
+from dlt.common.destination.reference import DestinationClientDwhConfiguration
+from dlt.common.schema.schema import Schema
from dlt.common.typing import TDataItem
from dlt.common.utils import uniq_id
+from dlt.destinations.exceptions import DatabaseUndefinedRelation
from tests.load.utils import destinations_configs, DestinationTestConfiguration
from tests.pipeline.utils import load_table_counts
+from tests.utils import TEST_STORAGE_ROOT, assert_load_info
+
+# mark all tests as essential, do not remove
+pytestmark = pytest.mark.essential
@pytest.mark.parametrize(
@@ -78,3 +85,91 @@ def items2() -> Iterator[TDataItem]:
finally:
with pipeline.sql_client() as client:
client.drop_dataset()
+
+
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(default_sql_configs=True, subset=["clickhouse"]),
+ ids=lambda x: x.name,
+)
+def test_clickhouse_no_dataset_name(destination_config: DestinationTestConfiguration) -> None:
+ # add staging to cover staging dataset name that must be present
+ destination_config.staging = dlt.destinations.filesystem(TEST_STORAGE_ROOT)
+ # create explicitly empty dataset
+    # NOTE: we use an empty string here, but when creating the pipeline directly you can just skip
+ # the dataset_name argument
+ pipeline = destination_config.setup_pipeline("no_dataset_name_items", dataset_name="")
+ # run only on localhost because we have a common dataset for all tests that may run in parallel
+ dest_client, staging_client = pipeline._get_destination_clients(Schema("test"))
+
+ # make sure staging has dataset name which is pipeline default
+ assert isinstance(staging_client.config, DestinationClientDwhConfiguration)
+ assert staging_client.config.dataset_name == pipeline._make_dataset_name(None, pipeline.staging)
+
+ assert isinstance(dest_client.config, DestinationClientDwhConfiguration)
+ print(dest_client.config.dataset_name)
+ assert dest_client.config.dataset_name == ""
+
+ if dest_client.config.credentials.host != "localhost": # type: ignore[attr-defined]
+ pytest.skip("Empty dataset may be tested only on a localhost clickhouse")
+
+ @dlt.resource(name="items", write_disposition="merge", primary_key="id")
+ def items() -> Iterator[Any]:
+ yield {
+ "id": 1,
+ "name": "item",
+ "sub_items": [{"id": 101, "name": "sub item 101"}, {"id": 101, "name": "sub item 102"}],
+ }
+
+ info = pipeline.run([items], **destination_config.run_kwargs)
+ assert_load_info(info)
+
+ table_counts = load_table_counts(
+ pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values()]
+ )
+ assert table_counts["items"] == 1
+ assert table_counts["items__sub_items"] == 2
+ assert table_counts["_dlt_loads"] == 1
+
+ # test drop storage, first create additional table and make sure it was not destroyed
+ pipeline_2 = destination_config.setup_pipeline("no_dataset_name_events", dataset_name="")
+ info = pipeline_2.run([items.with_name("events")], **destination_config.run_kwargs)
+ assert_load_info(info)
+
+ table_counts_2 = load_table_counts(
+ pipeline_2, *[t["name"] for t in pipeline_2.default_schema._schema_tables.values()]
+ )
+ assert table_counts_2["events"] == 1
+ assert table_counts_2["events__sub_items"] == 2
+ assert table_counts_2["_dlt_loads"] == 2
+
+ with pipeline.destination_client() as dest_client:
+ assert dest_client.is_storage_initialized()
+ dest_client.drop_storage()
+ assert not dest_client.is_storage_initialized()
+
+ # events are still there
+ table_counts_2 = load_table_counts(
+ pipeline_2, *[t for t in pipeline_2.default_schema.data_table_names()]
+ )
+ assert table_counts_2["events"] == 1
+ assert table_counts_2["events__sub_items"] == 2
+
+ # all other tables got dropped
+ with pytest.raises(DatabaseUndefinedRelation):
+ table_counts_2 = load_table_counts(
+ pipeline_2, *[t for t in pipeline_2.default_schema.dlt_table_names()]
+ )
+
+ # load again
+ info = pipeline_2.run([items.with_name("events")], **destination_config.run_kwargs)
+ assert_load_info(info)
+
+ table_counts_2 = load_table_counts(
+ pipeline_2, *[t["name"] for t in pipeline_2.default_schema._schema_tables.values()]
+ )
+ # merge
+ assert table_counts_2["events"] == 1
+ assert table_counts_2["events__sub_items"] == 2
+    # the _dlt_loads table was dropped with the storage so only 1 load is recorded
+ assert table_counts_2["_dlt_loads"] == 1
diff --git a/tests/load/pipeline/test_dremio.py b/tests/load/pipeline/test_dremio.py
index f19f9f44d9..01dd86c16e 100644
--- a/tests/load/pipeline/test_dremio.py
+++ b/tests/load/pipeline/test_dremio.py
@@ -5,6 +5,9 @@
from tests.pipeline.utils import load_table_counts
from tests.load.utils import DestinationTestConfiguration, destinations_configs
+# mark all tests as essential, do not remove
+pytestmark = pytest.mark.essential
+
@pytest.mark.parametrize(
"destination_config",
diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py
index b028edc1bb..98642bb263 100644
--- a/tests/load/pipeline/test_duckdb.py
+++ b/tests/load/pipeline/test_duckdb.py
@@ -259,3 +259,32 @@ def _get_shuffled_events(repeat: int = 1):
event_files = [m for m in metrics["job_metrics"].keys() if m.startswith("events.")]
assert len(event_files) == 5000 // 200
assert all(m.endswith("parquet") for m in event_files)
+
+
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(default_sql_configs=True, subset=["duckdb"]),
+ ids=lambda x: x.name,
+)
+def test_duckdb_credentials_separation(
+ destination_config: DestinationTestConfiguration,
+) -> None:
+ p1 = dlt.pipeline("p1", destination=duckdb(credentials=":pipeline:"))
+ p2 = dlt.pipeline("p2", destination=duckdb(credentials=":pipeline:"))
+
+ p1.run([1, 2, 3], table_name="p1_data")
+ p1_dataset = p1._dataset()
+
+ p2.run([1, 2, 3], table_name="p2_data")
+ p2_dataset = p2._dataset()
+
+    # both datasets should have independent duckdb databases
+    # destinations should still be bound to their pipelines
+ print(p1_dataset.p1_data.fetchall())
+ print(p2_dataset.p2_data.fetchall())
+
+ assert "p1" in p1_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined]
+ assert "p2" in p2_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined]
+
+ assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 # type: ignore[attr-defined]
+ assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 # type: ignore[attr-defined]
diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py
index b8cf66608c..8d890642ee 100644
--- a/tests/load/pipeline/test_filesystem_pipeline.py
+++ b/tests/load/pipeline/test_filesystem_pipeline.py
@@ -33,6 +33,7 @@
MEMORY_BUCKET,
FILE_BUCKET,
AZ_BUCKET,
+ SFTP_BUCKET,
)
from tests.pipeline.utils import load_table_counts, assert_load_info, load_tables_to_dicts
@@ -222,6 +223,9 @@ def some_source():
assert table.column("value").to_pylist() == [1, 2, 3, 4, 5]
+@pytest.mark.skip(
+ reason="pyarrow version check not needed anymore, since we have 17 as a dependency"
+)
def test_delta_table_pyarrow_version_check() -> None:
"""Tests pyarrow version checking for `delta` table format.
@@ -255,7 +259,7 @@ def foo():
destinations_configs(
table_format_filesystem_configs=True,
with_table_format="delta",
- bucket_exclude=(MEMORY_BUCKET),
+ bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET),
),
ids=lambda x: x.name,
)
@@ -586,6 +590,57 @@ def two_part():
assert dt.metadata().partition_columns == []
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(
+ table_format_filesystem_configs=True,
+ with_table_format="delta",
+ bucket_subset=(FILE_BUCKET),
+ ),
+ ids=lambda x: x.name,
+)
+def test_delta_table_partitioning_arrow_load_id(
+ destination_config: DestinationTestConfiguration,
+) -> None:
+ """Tests partitioning on load id column added by Arrow normalizer.
+
+ Case needs special handling because of bug in delta-rs:
+ https://github.com/delta-io/delta-rs/issues/2969
+ """
+ from dlt.common.libs.pyarrow import pyarrow
+ from dlt.common.libs.deltalake import get_delta_tables
+
+ os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "true"
+
+ pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True)
+
+ # append write disposition
+ info = pipeline.run(
+ pyarrow.table({"foo": [1]}),
+ table_name="delta_table",
+ columns={"_dlt_load_id": {"partition": True}},
+ table_format="delta",
+ )
+ assert_load_info(info)
+ dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
+ assert dt.metadata().partition_columns == ["_dlt_load_id"]
+ assert load_table_counts(pipeline, "delta_table")["delta_table"] == 1
+
+ # merge write disposition
+ info = pipeline.run(
+ pyarrow.table({"foo": [1, 2]}),
+ table_name="delta_table",
+ write_disposition={"disposition": "merge", "strategy": "upsert"},
+ columns={"_dlt_load_id": {"partition": True}},
+ primary_key="foo",
+ table_format="delta",
+ )
+ assert_load_info(info)
+ dt = get_delta_tables(pipeline, "delta_table")["delta_table"]
+ assert dt.metadata().partition_columns == ["_dlt_load_id"]
+ assert load_table_counts(pipeline, "delta_table")["delta_table"] == 2
+
+
@pytest.mark.essential
@pytest.mark.parametrize(
"destination_config",
@@ -931,7 +986,7 @@ def parent_delta():
destinations_configs(
table_format_filesystem_configs=True,
with_table_format="delta",
- bucket_subset=(FILE_BUCKET,),
+ bucket_subset=(FILE_BUCKET),
),
ids=lambda x: x.name,
)
diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py
index a81c1b13e9..2925bfac6f 100644
--- a/tests/load/pipeline/test_merge_disposition.py
+++ b/tests/load/pipeline/test_merge_disposition.py
@@ -20,8 +20,10 @@
from dlt.common.schema.typing import TLoaderMergeStrategy
from dlt.common.typing import StrAny
from dlt.common.utils import digest128
-from dlt.common.destination import TDestination
+from dlt.common.destination import AnyDestination, DestinationCapabilitiesContext
from dlt.common.destination.exceptions import DestinationCapabilitiesException
+from dlt.common.libs.pyarrow import row_tuples_to_arrow
+
from dlt.extract import DltResource
from dlt.sources.helpers.transform import skip_first, take_first
from dlt.pipeline.exceptions import PipelineStepFailed
@@ -40,12 +42,13 @@
DestinationTestConfiguration,
FILE_BUCKET,
AZ_BUCKET,
+ SFTP_BUCKET,
)
def skip_if_not_supported(
merge_strategy: TLoaderMergeStrategy,
- destination: TDestination,
+ destination: AnyDestination,
) -> None:
# resolve_merge_strategy
if merge_strategy not in destination.capabilities().supported_merge_strategies:
@@ -1521,3 +1524,77 @@ def r():
assert isinstance(ex.__context__, NormalizeJobFailed)
assert isinstance(ex.__context__.__context__, CannotCoerceNullException)
+
+
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "destination_config",
+ destinations_configs(
+ default_sql_configs=True,
+ local_filesystem_configs=True,
+ table_format_filesystem_configs=True,
+ supports_merge=True,
+ bucket_subset=(FILE_BUCKET),
+ ),
+ ids=lambda x: x.name,
+)
+@pytest.mark.parametrize("merge_strategy", ("delete-insert", "upsert"))
+def test_merge_arrow(
+ destination_config: DestinationTestConfiguration,
+ merge_strategy: TLoaderMergeStrategy,
+) -> None:
+ pipeline = destination_config.setup_pipeline("merge_arrow", dev_mode=True)
+
+ skip_if_not_supported(merge_strategy, pipeline.destination)
+
+ @dlt.resource(
+ write_disposition={"disposition": "merge", "strategy": merge_strategy},
+ primary_key="id",
+ table_format=destination_config.table_format,
+ )
+ def arrow_items(rows, schema_columns, timezone="UTC"):
+ yield row_tuples_to_arrow(
+ rows,
+ DestinationCapabilitiesContext.generic_capabilities(),
+ columns=schema_columns,
+ tz=timezone,
+ )
+
+ schema_columns = {
+ "id": {"name": "id", "nullable": False, "data_type": "bigint"},
+ "name": {"name": "name", "nullable": True, "data_type": "text"},
+ }
+ test_rows = [(1, "foo"), (2, "bar")]
+
+ load_info = pipeline.run(
+ arrow_items(test_rows, schema_columns),
+ )
+ assert_load_info(load_info)
+
+ tables = load_tables_to_dicts(pipeline, "arrow_items")
+
+ assert_records_as_set(
+ tables["arrow_items"],
+ [
+ {"id": 1, "name": "foo"},
+ {"id": 2, "name": "bar"},
+ ],
+ )
+
+ # Update the records
+ test_rows = [(1, "foo"), (2, "updated bar")]
+
+ load_info = pipeline.run(
+ arrow_items(test_rows, schema_columns),
+ )
+
+ assert_load_info(load_info)
+ tables = load_tables_to_dicts(pipeline, "arrow_items", "arrow_items")
+
+ assert_records_as_set(
+ tables["arrow_items"],
+ [
+ {"id": 1, "name": "foo"},
+ {"id": 2, "name": "updated bar"},
+ ],
+ )
diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py
index d064456c0d..9190225a8c 100644
--- a/tests/load/pipeline/test_pipelines.py
+++ b/tests/load/pipeline/test_pipelines.py
@@ -71,7 +71,7 @@ def test_default_pipeline_names(
possible_dataset_names = ["dlt_pytest_dataset", "dlt_pipeline_dataset"]
assert p.pipeline_name in possible_names
assert p.pipelines_dir == os.path.abspath(os.path.join(TEST_STORAGE_ROOT, ".dlt", "pipelines"))
- assert p.dataset_name in possible_dataset_names
+ assert p.dataset_name is None
assert p.destination is None
assert p.default_schema_name is None
@@ -107,13 +107,19 @@ def data_fun() -> Iterator[Any]:
else None
),
)
- # does not reset the dataset name
- assert p.dataset_name in possible_dataset_names
- # never do that in production code
- p.dataset_name = None
- # set no dataset name -> if destination does not support it we revert to default
p._set_dataset_name(None)
- assert p.dataset_name in possible_dataset_names
+
+ if p.destination.spec().needs_dataset_name(): # type: ignore
+ # sets dataset names for destinations that require it
+ assert p.dataset_name in possible_dataset_names
+ # never do that in production code
+ p.dataset_name = None
+ # set no dataset name -> if destination does not support it we revert to default
+ p._set_dataset_name(None)
+ assert p.dataset_name in possible_dataset_names
+ else:
+ # does not need dataset
+ assert p.dataset_name is None
# the last package contains just the state (we added a new schema)
last_load_id = p.list_extracted_load_packages()[-1]
state_package = p.get_load_package_info(last_load_id)
@@ -435,18 +441,26 @@ def test_pipeline_data_writer_compression(
# Ensure pipeline works without compression
data = ["a", "b", "c"]
dataset_name = "compression_data_" + uniq_id()
- dlt.config["data_writer"] = {
- "disable_compression": disable_compression
- } # not sure how else to set this
+ destination_config.disable_compression = disable_compression
+
p = destination_config.setup_pipeline("compression_test", dataset_name=dataset_name)
p.extract(dlt.resource(data, name="data"), table_format=destination_config.table_format)
s = p._get_normalize_storage()
# check that files are not compressed if compression is disabled
- if disable_compression:
- for f in s.list_files_to_normalize_sorted():
+ for name in s.list_files_to_normalize_sorted():
+ full_path = s.extracted_packages.storage.make_full_path(name)
+ if disable_compression:
with pytest.raises(gzip.BadGzipFile):
- gzip.open(s.extracted_packages.storage.make_full_path(f), "rb").read()
+ with gzip.open(full_path, "rb") as f:
+ f.read()
+ else:
+            # a gzip-compressed file won't decode as utf-8 text
+ with pytest.raises(UnicodeDecodeError):
+ with open(full_path, "rt", encoding="utf-8") as f:
+ f.readline()
+
p.normalize(loader_file_format=destination_config.file_format)
+
info = p.load()
assert_table(p, "data", data, info=info)
diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py
index 9d03cd478c..069ebd7841 100644
--- a/tests/load/sources/sql_database/test_sql_database_source.py
+++ b/tests/load/sources/sql_database/test_sql_database_source.py
@@ -793,7 +793,7 @@ def test_destination_caps_context(sql_source_db: SQLAlchemySourceDB, backend: Ta
columns = pipeline.default_schema.get_table("has_precision")["columns"]
assert columns["datetime_tz_col"]["precision"] == columns["datetime_ntz_col"]["precision"] == 3
# prevent drop
- pipeline.destination = None
+ pipeline._destination = None
@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"])
diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py
index 26b90e5a0d..e27000c841 100644
--- a/tests/load/test_dummy_client.py
+++ b/tests/load/test_dummy_client.py
@@ -11,7 +11,7 @@
from dlt.common.storages.configuration import FilesystemConfiguration
from dlt.common.storages.load_package import TPackageJobState
from dlt.common.storages.load_storage import JobFileFormatUnsupported
-from dlt.common.destination.reference import RunnableLoadJob, TDestination
+from dlt.common.destination.reference import RunnableLoadJob, AnyDestination
from dlt.common.schema.utils import (
fill_hints_from_parent_and_clone_table,
get_nested_tables,
@@ -130,7 +130,7 @@ def test_big_loadpackages() -> None:
duration = float(time() - start_time)
# sanity check
- assert duration > 3
+ assert duration > 2
# we want 1000 empty processed jobs to need less than 15 seconds total (locally it runs in 5)
assert duration < 15
@@ -1064,7 +1064,7 @@ def setup_loader(
client_config = client_config or DummyClientConfiguration(
loader_file_format="jsonl", completed_prob=1
)
- destination: TDestination = dummy(**client_config) # type: ignore[assignment]
+ destination: AnyDestination = dummy(**client_config) # type: ignore[assignment]
# setup
staging_system_config = None
staging = None
diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py
index ef73cbd509..c6019ecf2d 100644
--- a/tests/load/test_read_interfaces.py
+++ b/tests/load/test_read_interfaces.py
@@ -6,7 +6,6 @@
from dlt import Pipeline
from dlt.common import Decimal
-from dlt.common.utils import uniq_id
from typing import List
from functools import reduce
@@ -21,40 +20,58 @@
from dlt.destinations import filesystem
from tests.utils import TEST_STORAGE_ROOT
from dlt.common.destination.reference import TDestinationReferenceArg
-from dlt.destinations.dataset import ReadableDBAPIDataset
-
-
-def _run_dataset_checks(
- pipeline: Pipeline,
- destination_config: DestinationTestConfiguration,
- table_format: Any = None,
- alternate_access_pipeline: Pipeline = None,
-) -> None:
- destination_type = pipeline.destination_client().config.destination_type
-
- skip_df_chunk_size_check = False
- expected_columns = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"]
- if destination_type == "bigquery":
- chunk_size = 50
- total_records = 80
- elif destination_type == "mssql":
- chunk_size = 700
- total_records = 1000
- else:
- chunk_size = 2048
- total_records = 3000
-
- # on filesystem one chunk is one file and not the default vector size
- if destination_type == "filesystem":
- skip_df_chunk_size_check = True
-
- # we always expect 2 chunks based on the above setup
- expected_chunk_counts = [chunk_size, total_records - chunk_size]
+from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException
+from tests.load.utils import drop_pipeline_data
+
+EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"]
+
+
+def _total_records(p: Pipeline) -> int:
+ """how many records to load for a given pipeline"""
+ if p.destination.destination_type == "dlt.destinations.bigquery":
+ return 80
+ elif p.destination.destination_type == "dlt.destinations.mssql":
+ return 1000
+ return 3000
+
+
+def _chunk_size(p: Pipeline) -> int:
+ """chunk size for a given pipeline"""
+ if p.destination.destination_type == "dlt.destinations.bigquery":
+ return 50
+ elif p.destination.destination_type == "dlt.destinations.mssql":
+ return 700
+ return 2048
+
+
+def _expected_chunk_count(p: Pipeline) -> List[int]:
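+    """we always expect two chunks: one full chunk and the remainder of the records"""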
+ return [_chunk_size(p), _total_records(p) - _chunk_size(p)]
+
+
+@pytest.fixture(scope="session")
+def populated_pipeline(request) -> Any:
+ """fixture that returns a pipeline object populated with the example data"""
+ destination_config = cast(DestinationTestConfiguration, request.param)
+
+ if (
+ destination_config.file_format not in ["parquet", "jsonl"]
+ and destination_config.destination_type == "filesystem"
+ ):
+ pytest.skip(
+ "Test only works for jsonl and parquet on filesystem destination, given:"
+ f" {destination_config.file_format}"
+ )
+
+ pipeline = destination_config.setup_pipeline(
+ "read_pipeline", dataset_name="read_test", dev_mode=True
+ )
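+    # force multiple files per table, each file may hold at most 700 items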
+ os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700"
+ total_records = _total_records(pipeline)
@dlt.source()
def source():
@dlt.resource(
- table_format=table_format,
+ table_format=destination_config.table_format,
write_disposition="replace",
columns={
"id": {"data_type": "bigint"},
@@ -75,7 +92,7 @@ def items():
]
@dlt.resource(
- table_format=table_format,
+ table_format=destination_config.table_format,
write_disposition="replace",
columns={
"id": {"data_type": "bigint"},
@@ -97,42 +114,49 @@ def double_items():
s = source()
pipeline.run(s, loader_file_format=destination_config.file_format)
- if alternate_access_pipeline:
- pipeline.destination = alternate_access_pipeline.destination
-
- # access via key
- table_relationship = pipeline._dataset()["items"]
+ # in case of delta on gcs we use the s3 compat layer for reading
+ # for writing we still need to use the gc authentication, as delta_rs seems to use
+ # methods on the s3 interface that are not implemented by gcs
+ if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format == "delta":
+ gcp_bucket = filesystem(
+ GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp"
+ )
+ access_pipeline = destination_config.setup_pipeline(
+ "read_pipeline", dataset_name="read_test", destination=gcp_bucket
+ )
- # full frame
- df = table_relationship.df()
- assert len(df.index) == total_records
+ pipeline.destination = access_pipeline.destination
- #
- # check dataframes
- #
+ # return pipeline to test
+ yield pipeline
- # chunk
- df = table_relationship.df(chunk_size=chunk_size)
- if not skip_df_chunk_size_check:
- assert len(df.index) == chunk_size
- # lowercase results for the snowflake case
- assert set(df.columns.values) == set(expected_columns)
+    # NOTE: we need to drop pipeline data here since we are keeping the pipelines around for the whole test session
+ drop_pipeline_data(pipeline)
- # iterate all dataframes
- frames = list(table_relationship.iter_df(chunk_size=chunk_size))
- if not skip_df_chunk_size_check:
- assert [len(df.index) for df in frames] == expected_chunk_counts
- # check all items are present
- ids = reduce(lambda a, b: a + b, [f[expected_columns[0]].to_list() for f in frames])
- assert set(ids) == set(range(total_records))
+# NOTE: we collect all destination configs centrally; this way the session-scoped
+# pipeline population per fixture works and saves a lot of time
+configs = destinations_configs(
+ default_sql_configs=True,
+ all_buckets_filesystem_configs=True,
+ table_format_filesystem_configs=True,
+ bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET],
+)
- # access via prop
- table_relationship = pipeline._dataset().items
- #
- # check arrow tables
- #
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_arrow_access(populated_pipeline: Pipeline) -> None:
+ table_relationship = populated_pipeline._dataset().items
+ total_records = _total_records(populated_pipeline)
+ chunk_size = _chunk_size(populated_pipeline)
+ expected_chunk_counts = _expected_chunk_count(populated_pipeline)
# full table
table = table_relationship.arrow()
@@ -140,7 +164,7 @@ def double_items():
# chunk
table = table_relationship.arrow(chunk_size=chunk_size)
- assert set(table.column_names) == set(expected_columns)
+ assert set(table.column_names) == set(EXPECTED_COLUMNS)
assert table.num_rows == chunk_size
# check frame amount and items counts
@@ -148,11 +172,64 @@ def double_items():
assert [t.num_rows for t in tables] == expected_chunk_counts
# check all items are present
- ids = reduce(lambda a, b: a + b, [t.column(expected_columns[0]).to_pylist() for t in tables])
+ ids = reduce(lambda a, b: a + b, [t.column(EXPECTED_COLUMNS[0]).to_pylist() for t in tables])
assert set(ids) == set(range(total_records))
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_dataframe_access(populated_pipeline: Pipeline) -> None:
+ # access via key
+ table_relationship = populated_pipeline._dataset()["items"]
+ total_records = _total_records(populated_pipeline)
+ chunk_size = _chunk_size(populated_pipeline)
+ expected_chunk_counts = _expected_chunk_count(populated_pipeline)
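+    # on the filesystem destination one chunk is one file and not the default vector size, so exact chunk sizes are not checked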
+ skip_df_chunk_size_check = (
+ populated_pipeline.destination.destination_type == "dlt.destinations.filesystem"
+ )
+
+ # full frame
+ df = table_relationship.df()
+ assert len(df.index) == total_records
+
+ # chunk
+ df = table_relationship.df(chunk_size=chunk_size)
+ if not skip_df_chunk_size_check:
+ assert len(df.index) == chunk_size
+
+ # lowercase results for the snowflake case
+ assert set(df.columns.values) == set(EXPECTED_COLUMNS)
+
+ # iterate all dataframes
+ frames = list(table_relationship.iter_df(chunk_size=chunk_size))
+ if not skip_df_chunk_size_check:
+ assert [len(df.index) for df in frames] == expected_chunk_counts
+
+ # check all items are present
+ ids = reduce(lambda a, b: a + b, [f[EXPECTED_COLUMNS[0]].to_list() for f in frames])
+ assert set(ids) == set(range(total_records))
+
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_db_cursor_access(populated_pipeline: Pipeline) -> None:
# check fetch accessors
- table_relationship = pipeline._dataset().items
+ table_relationship = populated_pipeline._dataset().items
+ total_records = _total_records(populated_pipeline)
+ chunk_size = _chunk_size(populated_pipeline)
+ expected_chunk_counts = _expected_chunk_count(populated_pipeline)
# check accessing one item
one = table_relationship.fetchone()
@@ -173,10 +250,21 @@ def double_items():
ids = reduce(lambda a, b: a + b, [[item[0] for item in chunk] for chunk in chunks])
assert set(ids) == set(range(total_records))
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_hint_preservation(populated_pipeline: Pipeline) -> None:
+ table_relationship = populated_pipeline._dataset().items
# check that hints are carried over to arrow table
expected_decimal_precision = 10
expected_decimal_precision_2 = 12
- if destination_config.destination_type == "bigquery":
+ if populated_pipeline.destination.destination_type == "dlt.destinations.bigquery":
# bigquery does not allow precision configuration..
expected_decimal_precision = 38
expected_decimal_precision_2 = 38
@@ -189,39 +277,123 @@ def double_items():
== expected_decimal_precision_2
)
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_loads_table_access(populated_pipeline: Pipeline) -> None:
+ # check loads table access, we should have one entry
+ loads_table = populated_pipeline._dataset()[populated_pipeline.default_schema.loads_table_name]
+ assert len(loads_table.fetchall()) == 1
+
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_sql_queries(populated_pipeline: Pipeline) -> None:
# simple check that query also works
- tname = pipeline.sql_client().make_qualified_table_name("items")
- query_relationship = pipeline._dataset()(f"select * from {tname} where id < 20")
+ tname = populated_pipeline.sql_client().make_qualified_table_name("items")
+ query_relationship = populated_pipeline._dataset()(f"select * from {tname} where id < 20")
# we selected the first 20
table = query_relationship.arrow()
assert table.num_rows == 20
# check join query
- tdname = pipeline.sql_client().make_qualified_table_name("double_items")
+ tdname = populated_pipeline.sql_client().make_qualified_table_name("double_items")
query = (
f"SELECT i.id, di.double_id FROM {tname} as i JOIN {tdname} as di ON (i.id = di.id) WHERE"
" i.id < 20 ORDER BY i.id ASC"
)
- join_relationship = pipeline._dataset()(query)
+ join_relationship = populated_pipeline._dataset()(query)
table = join_relationship.fetchall()
assert len(table) == 20
assert list(table[0]) == [0, 0]
assert list(table[5]) == [5, 10]
assert list(table[10]) == [10, 20]
- # check loads table access
- loads_table = pipeline._dataset()[pipeline.default_schema.loads_table_name]
- loads_table.fetchall()
- destination_for_dataset: TDestinationReferenceArg = (
- alternate_access_pipeline.destination
- if alternate_access_pipeline
- else destination_config.destination_type
- )
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_limit_and_head(populated_pipeline: Pipeline) -> None:
+ table_relationship = populated_pipeline._dataset().items
+
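+    # head() is expected to return the first 5 rows by default, while limit(n) caps the result at n rows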
+ assert len(table_relationship.head().fetchall()) == 5
+ assert len(table_relationship.limit(24).fetchall()) == 24
+
+ assert len(table_relationship.head().df().index) == 5
+ assert len(table_relationship.limit(24).df().index) == 24
+
+ assert table_relationship.head().arrow().num_rows == 5
+ assert table_relationship.limit(24).arrow().num_rows == 24
+
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_column_selection(populated_pipeline: Pipeline) -> None:
+ table_relationship = populated_pipeline._dataset().items
+
+ columns = ["_dlt_load_id", "other_decimal"]
+ data_frame = table_relationship.select(*columns).head().df()
+ assert [v.lower() for v in data_frame.columns.values] == columns
+ assert len(data_frame.index) == 5
+
+ columns = ["decimal", "other_decimal"]
+ arrow_table = table_relationship[columns].head().arrow()
+ assert arrow_table.column_names == columns
+ assert arrow_table.num_rows == 5
+
+ # hints should also be preserved via computed reduced schema
+ expected_decimal_precision = 10
+ expected_decimal_precision_2 = 12
+ if populated_pipeline.destination.destination_type == "dlt.destinations.bigquery":
+ # bigquery does not allow precision configuration..
+ expected_decimal_precision = 38
+ expected_decimal_precision_2 = 38
+ assert arrow_table.schema.field("decimal").type.precision == expected_decimal_precision
+ assert arrow_table.schema.field("other_decimal").type.precision == expected_decimal_precision_2
+
+ with pytest.raises(ReadableRelationUnknownColumnException):
+ arrow_table = table_relationship.select("unknown_column").head().arrow()
+
+
+@pytest.mark.no_load
+@pytest.mark.essential
+@pytest.mark.parametrize(
+ "populated_pipeline",
+ configs,
+ indirect=True,
+ ids=lambda x: x.name,
+)
+def test_standalone_dataset(populated_pipeline: Pipeline) -> None:
+ total_records = _total_records(populated_pipeline)
# check dataset factory
- dataset = dlt._dataset(destination=destination_for_dataset, dataset_name=pipeline.dataset_name)
+ dataset = dlt._dataset(
+ destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name
+ )
    # verify that sql client and schema are lazy loaded
assert not dataset._schema
assert not dataset._sql_client
@@ -233,9 +405,9 @@ def double_items():
dataset = cast(
ReadableDBAPIDataset,
dlt._dataset(
- destination=destination_for_dataset,
- dataset_name=pipeline.dataset_name,
- schema=pipeline.default_schema_name,
+ destination=populated_pipeline.destination,
+ dataset_name=populated_pipeline.dataset_name,
+ schema=populated_pipeline.default_schema_name,
),
)
assert dataset.schema.tables["items"]["write_disposition"] == "replace"
@@ -244,30 +416,30 @@ def double_items():
dataset = cast(
ReadableDBAPIDataset,
dlt._dataset(
- destination=destination_for_dataset,
- dataset_name=pipeline.dataset_name,
+ destination=populated_pipeline.destination,
+ dataset_name=populated_pipeline.dataset_name,
schema="wrong_schema_name",
),
)
assert "items" not in dataset.schema.tables
- assert dataset.schema.name == pipeline.dataset_name
+ assert dataset.schema.name == populated_pipeline.dataset_name
# check that schema is loaded if no schema name given
dataset = cast(
ReadableDBAPIDataset,
dlt._dataset(
- destination=destination_for_dataset,
- dataset_name=pipeline.dataset_name,
+ destination=populated_pipeline.destination,
+ dataset_name=populated_pipeline.dataset_name,
),
)
- assert dataset.schema.name == pipeline.default_schema_name
+ assert dataset.schema.name == populated_pipeline.default_schema_name
assert dataset.schema.tables["items"]["write_disposition"] == "replace"
# check that there is no error when creating dataset without schema table
dataset = cast(
ReadableDBAPIDataset,
dlt._dataset(
- destination=destination_for_dataset,
+ destination=populated_pipeline.destination,
dataset_name="unknown_dataset",
),
)
@@ -281,105 +453,17 @@ def double_items():
other_schema = Schema("some_other_schema")
other_schema.tables["other_table"] = utils.new_table("other_table")
- pipeline._inject_schema(other_schema)
- pipeline.default_schema_name = other_schema.name
- with pipeline.destination_client() as client:
+ populated_pipeline._inject_schema(other_schema)
+ populated_pipeline.default_schema_name = other_schema.name
+ with populated_pipeline.destination_client() as client:
client.update_stored_schema()
dataset = cast(
ReadableDBAPIDataset,
dlt._dataset(
- destination=destination_for_dataset,
- dataset_name=pipeline.dataset_name,
+ destination=populated_pipeline.destination,
+ dataset_name=populated_pipeline.dataset_name,
),
)
assert dataset.schema.name == "some_other_schema"
assert "other_table" in dataset.schema.tables
-
-
-@pytest.mark.essential
-@pytest.mark.parametrize(
- "destination_config",
- destinations_configs(default_sql_configs=True),
- ids=lambda x: x.name,
-)
-def test_read_interfaces_sql(destination_config: DestinationTestConfiguration) -> None:
- pipeline = destination_config.setup_pipeline(
- "read_pipeline", dataset_name="read_test", dev_mode=True
- )
- _run_dataset_checks(pipeline, destination_config)
-
-
-@pytest.mark.essential
-@pytest.mark.parametrize(
- "destination_config",
- destinations_configs(
- local_filesystem_configs=True,
- all_buckets_filesystem_configs=True,
- bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET],
- ), # TODO: make SFTP work
- ids=lambda x: x.name,
-)
-def test_read_interfaces_filesystem(destination_config: DestinationTestConfiguration) -> None:
- # we force multiple files per table, they may only hold 700 items
- os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700"
-
- if destination_config.file_format not in ["parquet", "jsonl"]:
- pytest.skip(
- f"Test only works for jsonl and parquet, given: {destination_config.file_format}"
- )
-
- pipeline = destination_config.setup_pipeline(
- "read_pipeline",
- dataset_name="read_test",
- dev_mode=True,
- )
-
- _run_dataset_checks(pipeline, destination_config)
-
- # for gcs buckets we additionally test the s3 compat layer
- if destination_config.bucket_url == GCS_BUCKET:
- gcp_bucket = filesystem(
- GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp"
- )
- pipeline = destination_config.setup_pipeline(
- "read_pipeline", dataset_name="read_test", dev_mode=True, destination=gcp_bucket
- )
- _run_dataset_checks(pipeline, destination_config)
-
-
-@pytest.mark.essential
-@pytest.mark.parametrize(
- "destination_config",
- destinations_configs(
- table_format_filesystem_configs=True,
- with_table_format="delta",
- bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET],
- ),
- ids=lambda x: x.name,
-)
-def test_delta_tables(destination_config: DestinationTestConfiguration) -> None:
- os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700"
-
- pipeline = destination_config.setup_pipeline(
- "read_pipeline", dataset_name="read_test", dev_mode=True
- )
-
- # in case of gcs we use the s3 compat layer for reading
- # for writing we still need to use the gc authentication, as delta_rs seems to use
- # methods on the s3 interface that are not implemented by gcs
- access_pipeline = pipeline
- if destination_config.bucket_url == GCS_BUCKET:
- gcp_bucket = filesystem(
- GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp"
- )
- access_pipeline = destination_config.setup_pipeline(
- "read_pipeline", dataset_name="read_test", destination=gcp_bucket
- )
-
- _run_dataset_checks(
- pipeline,
- destination_config,
- table_format="delta",
- alternate_access_pipeline=access_pipeline,
- )
diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py
index 0aaa18eac1..05c10a900f 100644
--- a/tests/load/test_sql_client.py
+++ b/tests/load/test_sql_client.py
@@ -152,7 +152,9 @@ def test_create_drop_dataset(naming: str, client: SqlJobClientBase) -> None:
# Dataset is already create in fixture, so next time it fails
with pytest.raises(DatabaseException):
client.sql_client.create_dataset()
+ assert client.is_storage_initialized() is True
client.sql_client.drop_dataset()
+ assert client.is_storage_initialized() is False
with pytest.raises(DatabaseUndefinedRelation):
client.sql_client.drop_dataset()
diff --git a/tests/load/utils.py b/tests/load/utils.py
index 575938af15..5c24b2d1dc 100644
--- a/tests/load/utils.py
+++ b/tests/load/utils.py
@@ -120,6 +120,7 @@
aws_access_key_id=dlt.config.get("tests.r2_aws_access_key_id", str),
aws_secret_access_key=dlt.config.get("tests.r2_aws_secret_access_key", str),
endpoint_url=dlt.config.get("tests.r2_endpoint_url", str),
+ region_name=dlt.config.get("tests.r2_region_name", str),
),
)
@@ -160,7 +161,7 @@ class DestinationTestConfiguration:
supports_merge: bool = True # TODO: take it from client base class
force_iceberg: bool = None # used only to test deprecation
supports_dbt: bool = True
- disable_compression: bool = False
+    disable_compression: Optional[bool] = None  # use the destination default when not set
dev_mode: bool = False
credentials: Optional[Union[CredentialsConfiguration, Dict[str, Any], str]] = None
env_vars: Optional[Dict[str, str]] = None
@@ -211,8 +212,11 @@ def setup(self) -> None:
os.environ[f"DESTINATION__{k.upper()}"] = str(v)
# For the filesystem destinations we disable compression to make analyzing the result easier
- if self.destination_type == "filesystem" or self.disable_compression:
- os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True"
+ os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = str(
+ self.destination_type == "filesystem"
+ if self.disable_compression is None
+ else self.disable_compression
+ )
if self.credentials is not None:
if isinstance(self.credentials, str):
@@ -240,7 +244,7 @@ def setup_pipeline(
pipeline_name=pipeline_name,
destination=destination,
staging=kwargs.pop("staging", self.staging),
- dataset_name=dataset_name or pipeline_name,
+ dataset_name=dataset_name if dataset_name is not None else pipeline_name,
dev_mode=dev_mode,
**kwargs,
)
@@ -371,6 +375,7 @@ def destinations_configs(
file_format="parquet",
bucket_url=AWS_BUCKET,
supports_dbt=False,
+ extra_info="minio",
)
]
destination_configs += [
@@ -436,7 +441,7 @@ def destinations_configs(
file_format="jsonl",
bucket_url=AWS_BUCKET,
stage_name="PUBLIC.dlt_s3_stage",
- extra_info="s3-integration",
+ extra_info="s3-integration-public-stage",
),
DestinationTestConfiguration(
destination_type="snowflake",
@@ -512,13 +517,6 @@ def destinations_configs(
bucket_url=AWS_BUCKET,
extra_info="s3-authorization",
),
- DestinationTestConfiguration(
- destination_type="dremio",
- staging=filesystem(destination_name="minio"),
- file_format="parquet",
- bucket_url=AWS_BUCKET,
- supports_dbt=False,
- ),
]
if all_staging_configs:
@@ -606,9 +604,10 @@ def destinations_configs(
DestinationTestConfiguration(
destination_type="filesystem",
bucket_url=bucket,
- extra_info=bucket,
+ extra_info=bucket + "-delta",
table_format="delta",
supports_merge=True,
+ file_format="parquet",
env_vars=(
{
"DESTINATION__FILESYSTEM__DELTALAKE_STORAGE_OPTIONS": (
@@ -706,40 +705,44 @@ def drop_pipeline(request, preserve_environ) -> Iterator[None]:
pass
-def drop_active_pipeline_data() -> None:
- """Drops all the datasets for currently active pipeline, wipes the working folder and then deactivated it."""
- if Container()[PipelineContext].is_active():
- try:
- # take existing pipeline
- p = dlt.pipeline()
+def drop_pipeline_data(p: dlt.Pipeline) -> None:
+ """Drops all the datasets for a given pipeline"""
- def _drop_dataset(schema_name: str) -> None:
- with p.destination_client(schema_name) as client:
+ def _drop_dataset(schema_name: str) -> None:
+ with p.destination_client(schema_name) as client:
+ try:
+ client.drop_storage()
+ print("dropped")
+ except Exception as exc:
+ print(exc)
+ if isinstance(client, WithStagingDataset):
+ with client.with_staging_dataset():
try:
client.drop_storage()
- print("dropped")
+ print("staging dropped")
except Exception as exc:
print(exc)
- if isinstance(client, WithStagingDataset):
- with client.with_staging_dataset():
- try:
- client.drop_storage()
- print("staging dropped")
- except Exception as exc:
- print(exc)
-
- # drop_func = _drop_dataset_fs if _is_filesystem(p) else _drop_dataset_sql
- # take all schemas and if destination was set
- if p.destination:
- if p.config.use_single_dataset:
- # drop just the dataset for default schema
- if p.default_schema_name:
- _drop_dataset(p.default_schema_name)
- else:
- # for each schema, drop the dataset
- for schema_name in p.schema_names:
- _drop_dataset(schema_name)
+ # drop_func = _drop_dataset_fs if _is_filesystem(p) else _drop_dataset_sql
+ # take all schemas and if destination was set
+ if p.destination:
+ if p.config.use_single_dataset:
+ # drop just the dataset for default schema
+ if p.default_schema_name:
+ _drop_dataset(p.default_schema_name)
+ else:
+ # for each schema, drop the dataset
+ for schema_name in p.schema_names:
+ _drop_dataset(schema_name)
+
+
+def drop_active_pipeline_data() -> None:
+ """Drops all the datasets for currently active pipeline, wipes the working folder and then deactivated it."""
+ if Container()[PipelineContext].is_active():
+ try:
+ # take existing pipeline
+ p = dlt.pipeline()
+ drop_pipeline_data(p)
# p._wipe_working_folder()
finally:
# always deactivate context, working directory will be wiped when the next test starts
diff --git a/tests/normalize/test_max_nesting.py b/tests/normalize/test_max_nesting.py
index fb2b2d70f6..20a926cf42 100644
--- a/tests/normalize/test_max_nesting.py
+++ b/tests/normalize/test_max_nesting.py
@@ -1,371 +1,229 @@
-from typing import Any, Dict, List
+from typing import List
import dlt
import pytest
-from dlt.common import json
from dlt.destinations import dummy
-from tests.common.utils import json_case_path
-ROOT_TABLES = ["bot_events"]
+example_data = {"one": [{"two": [{"three": [{"four": [{"five": "value"}]}]}]}]}
+example_data_with_alternative_tree = {
+ "one_alternative": [{"two": [{"three": [{"four": [{"five": "value"}]}]}]}]
+}
-ALL_TABLES_FOR_RASA_EVENT = [
- "bot_events",
- "bot_events__metadata__known_recipients",
- "bot_events__metadata__transaction_history__spend__target",
- "bot_events__metadata__transaction_history__spend__starbucks",
- "bot_events__metadata__transaction_history__spend__amazon",
- "bot_events__metadata__transaction_history__deposit__employer",
- "bot_events__metadata__transaction_history__deposit__interest",
- "bot_events__metadata__vendor_list",
+NESTING_LEVEL_0 = [""]
+NESTING_LEVEL_1 = [
+ "",
+ "__one",
]
-
-ALL_TABLES_FOR_RASA_EVENT_NESTING_LEVEL_2 = [
- "bot_events",
- "bot_events__metadata__known_recipients",
- "bot_events__metadata__vendor_list",
+NESTING_LEVEL_2 = ["", "__one", "__one__two"]
+NESTING_LEVEL_3 = [
+ "",
+ "__one",
+ "__one__two",
+ "__one__two__three",
+]
+NESTING_LEVEL_4 = [
+ "",
+ "__one",
+ "__one__two",
+ "__one__two__three",
+ "__one__two__three__four",
]
-@pytest.fixture(scope="module")
-def rasa_event_bot_metadata():
- with open(json_case_path("rasa_event_bot_metadata"), "rb") as f:
- return json.load(f)
+def _table_names_for_base_table(
+ _name: str, tables: List[str], alternative_tree: bool = False
+) -> List[str]:
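+    """prefix the expected child table suffixes with the resource name, optionally switching to the alternative tree root"""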
+ tables = [f"{_name}{table}" for table in tables]
+ if alternative_tree:
+ tables = [t.replace("__one", "__one_alternative") for t in tables]
+ return tables
+
+
+def _get_pipeline() -> dlt.Pipeline:
+ return dlt.pipeline(
+ pipeline_name="test_max_table_nesting",
+ destination=dummy(timeout=0.1, completed_prob=1),
+ dev_mode=True,
+ )
@pytest.mark.parametrize(
- "nesting_level,expected_num_tables,expected_table_names",
+ "nesting_level_resource,nesting_level_source,expected_table_names",
(
- (0, 1, ROOT_TABLES),
- (1, 1, ROOT_TABLES),
- (2, 3, ALL_TABLES_FOR_RASA_EVENT_NESTING_LEVEL_2),
- (5, 8, ALL_TABLES_FOR_RASA_EVENT),
- (15, 8, ALL_TABLES_FOR_RASA_EVENT),
- (25, 8, ALL_TABLES_FOR_RASA_EVENT),
- (1000, 8, ALL_TABLES_FOR_RASA_EVENT),
+ (0, 3, NESTING_LEVEL_0), # resource overrides source
+ (1, 3, NESTING_LEVEL_1),
+ (2, 3, NESTING_LEVEL_2),
+ (3, 3, NESTING_LEVEL_3),
+ (4, 3, NESTING_LEVEL_4),
+ (5, 3, NESTING_LEVEL_4),
+ (6, 3, NESTING_LEVEL_4),
+ (None, 3, NESTING_LEVEL_3),
+ (None, 4, NESTING_LEVEL_4),
),
)
-def test_resource_max_nesting(
- nesting_level: int,
- expected_num_tables: int,
+def test_basic_resource_max_nesting(
+ nesting_level_resource: int,
+ nesting_level_source: int,
expected_table_names: List[str],
- rasa_event_bot_metadata: Dict[str, Any],
):
- @dlt.resource(max_table_nesting=nesting_level)
- def bot_events():
- yield rasa_event_bot_metadata
+ @dlt.resource(max_table_nesting=nesting_level_resource)
+ def base_table():
+ yield example_data
- assert "x-normalizer" in bot_events._hints
+ @dlt.source(max_table_nesting=nesting_level_source)
+ def source():
+ return base_table()
- pipeline_name = f"test_max_table_nesting_{nesting_level}_{expected_num_tables}"
- pipeline = dlt.pipeline(
- pipeline_name=pipeline_name,
- destination=dummy(timeout=0.1, completed_prob=1),
- dev_mode=True,
- )
+ if nesting_level_resource is not None:
+ assert "x-normalizer" in base_table._hints
+ else:
+ assert "x-normalizer" not in base_table._hints
+
+ pipeline = _get_pipeline()
+ pipeline.run(source())
- pipeline.run(bot_events)
- assert pipeline.schemas.keys()
- assert pipeline_name in pipeline.schema_names
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(
+ _table_names_for_base_table("base_table", expected_table_names)
+ )
- pipeline_schema = pipeline.schemas[pipeline_name]
- assert len(pipeline_schema.data_table_names()) == expected_num_tables
- all_table_names = pipeline_schema.data_table_names()
- for table_name in expected_table_names:
- assert table_name in all_table_names
+def test_multiple_configurations():
+ """test different settings on resources and source at the same time"""
+ @dlt.resource(max_table_nesting=2)
+ def base_table_1():
+ yield example_data
+
+ @dlt.resource(max_table_nesting=4)
+ def base_table_2():
+ yield example_data
+
+ # resource below will inherit from source
+ @dlt.resource()
+ def base_table_3():
+ yield example_data
+
+ @dlt.source(max_table_nesting=3)
+ def source():
+ return [base_table_1(), base_table_2(), base_table_3()]
+
+ pipeline = _get_pipeline()
+ pipeline.run(source())
+
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(
+ _table_names_for_base_table("base_table_1", NESTING_LEVEL_2)
+ + _table_names_for_base_table("base_table_2", NESTING_LEVEL_4)
+ + _table_names_for_base_table("base_table_3", NESTING_LEVEL_3)
+ )
-def test_with_multiple_resources_with_max_table_nesting_levels(
- rasa_event_bot_metadata: Dict[str, Any],
-):
- """Test max_table_nesting feature with multiple resources and a source
- Test scenario includes
-
- 1. Testing three different sources with set and unset `max_table_nesting` parameter
- and checks if the number of created tables in the schema match the expected numbers
- and the exact list table names have been collected;
- 2. For the same parent source we change the `max_table_nesting` and verify if it is respected
- by the third resource `third_resource_with_nested_data` as well as checking
- the number of created tables in the current schema;
- 3. Run combined test where we set `max_table_nesting` for the parent source and check
- if this `max_table_nesting` is respected by child resources where they don't define their
- own nesting level;
- 4. Run the pipeline with set `max_table_nesting` of a resource then override it and
- rerun the pipeline to check if the number and names of tables are expected;
- 5. Create source and resource both with defined `max_nesting_level` and check if we respect
- `max_nesting_level` from resource;
- """
- @dlt.resource(max_table_nesting=1)
- def rasa_bot_events_with_nesting_lvl_one():
- yield rasa_event_bot_metadata
+def test_update_table_nesting_level_resource():
+ """test if we can update the max_table_nesting level of a resource"""
@dlt.resource(max_table_nesting=2)
- def rasa_bot_events_with_nesting_lvl_two():
- yield rasa_event_bot_metadata
-
- all_table_names_for_third_resource = [
- "third_resource_with_nested_data",
- "third_resource_with_nested_data__payload__hints",
- "third_resource_with_nested_data__payload__hints__f_float",
- "third_resource_with_nested_data__payload__hints__f_float__comments",
- "third_resource_with_nested_data__params",
- ]
-
- @dlt.resource
- def third_resource_with_nested_data(): # first top level table `third_resource_with_nested_data`
- yield [
- {
- "id": 1,
- "payload": {
- "f_int": 7817289713,
- "f_float": 878172.8292,
- "f_timestamp": "2024-04-19T11:40:32.901899+00:00",
- "f_bool": False,
- "hints": [ # second table `third_resource_with_nested_data__payload__hints`
- {
- "f_bool": "bool",
- "f_timestamp": "bigint",
- "f_float": [ # third table `third_resource_with_nested_data__payload__hints__f_float`
- {
- "cond": "precision > 4",
- "then": "decimal",
- "else": "float",
- "comments": [ # fourth table `third_resource_with_nested_data__payload__hints__f_float__comments`
- {
- "text": "blabla bla bla we promise magix",
- "author": "bart",
- }
- ],
- }
- ],
- }
- ],
- },
- "params": [{"id": 1, "q": "search"}, {"id": 2, "q": "hashtag-search"}],
- }
- ]
-
- assert "x-normalizer" in rasa_bot_events_with_nesting_lvl_one._hints
- assert "x-normalizer" in rasa_bot_events_with_nesting_lvl_two._hints
- assert rasa_bot_events_with_nesting_lvl_one.max_table_nesting == 1
- assert rasa_bot_events_with_nesting_lvl_two.max_table_nesting == 2
- assert rasa_bot_events_with_nesting_lvl_one._hints["x-normalizer"]["max_nesting"] == 1 # type: ignore[typeddict-item]
- assert rasa_bot_events_with_nesting_lvl_two._hints["x-normalizer"]["max_nesting"] == 2 # type: ignore[typeddict-item]
- assert "x-normalizer" not in third_resource_with_nested_data._hints
-
- # Check scenario #1
- @dlt.source(max_table_nesting=100)
- def some_data():
- return [
- rasa_bot_events_with_nesting_lvl_one(),
- rasa_bot_events_with_nesting_lvl_two(),
- third_resource_with_nested_data(),
- ]
-
- pipeline_name = "test_different_table_nesting_levels"
- pipeline = dlt.pipeline(
- pipeline_name=pipeline_name,
- destination=dummy(timeout=0.1, completed_prob=1),
- dev_mode=True,
- )
+ def base_table_1():
+ yield example_data
- pipeline.run(some_data(), write_disposition="append")
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
-
- # expect only one table for resource `rasa_bot_events_with_nesting_lvl_one`
- tables = [tbl for tbl in all_table_names if tbl.endswith("nesting_lvl_one")]
- assert len(tables) == 1
- assert tables == ["rasa_bot_events_with_nesting_lvl_one"]
-
- # expect three tables for resource `rasa_bot_events_with_nesting_lvl_two`
- tables = [tbl for tbl in all_table_names if "nesting_lvl_two" in tbl]
- assert len(tables) == 3
- assert tables == [
- "rasa_bot_events_with_nesting_lvl_two",
- "rasa_bot_events_with_nesting_lvl_two__metadata__known_recipients",
- "rasa_bot_events_with_nesting_lvl_two__metadata__vendor_list",
- ]
-
- # expect four tables for resource `third_resource_with_nested_data`
- tables = [tbl for tbl in all_table_names if "third_resource" in tbl]
- assert len(tables) == 5
- assert tables == all_table_names_for_third_resource
-
- # Check scenario #2
- # now we need to check `third_resource_with_nested_data`
- # using different nesting levels at the source level
- # First we do with max_table_nesting=0
- @dlt.source(max_table_nesting=0)
- def some_data_v2():
- yield third_resource_with_nested_data()
+ pipeline = _get_pipeline()
+ pipeline.run(base_table_1())
- pipeline.drop()
- pipeline.run(some_data_v2(), write_disposition="append")
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
- assert len(all_table_names) == 1
- assert all_table_names == [
- "third_resource_with_nested_data",
- ]
-
- # Second we do with max_table_nesting=1
- some_data_source = some_data_v2()
- some_data_source.max_table_nesting = 1
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(_table_names_for_base_table("base_table_1", NESTING_LEVEL_2))
- pipeline.drop()
- pipeline.run(some_data_source, write_disposition="append")
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
- assert len(all_table_names) == 2
- assert all_table_names == [
- "third_resource_with_nested_data",
- "third_resource_with_nested_data__params",
- ]
-
- # Second we do with max_table_nesting=2
- some_data_source = some_data_v2()
- some_data_source.max_table_nesting = 3
+ base_table_1.max_table_nesting = 3
+ assert base_table_1.max_table_nesting == 3
+ pipeline.run(base_table_1())
- pipeline.drop()
- pipeline.run(some_data_source, write_disposition="append")
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
+ # NOTE: it will stay the same since the resource column at nesting level 2 is marked as complex
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(_table_names_for_base_table("base_table_1", NESTING_LEVEL_2))
- # 5 because payload is a dictionary not a collection of dictionaries
- assert len(all_table_names) == 5
- assert all_table_names == all_table_names_for_third_resource
+ # loading with alternative data works
+ @dlt.resource(max_table_nesting=3) # type: ignore[no-redef]
+ def base_table_1():
+ yield example_data_with_alternative_tree
- # Check scenario #3
- pipeline.drop()
- some_data_source = some_data()
- some_data_source.max_table_nesting = 0
- pipeline.run(some_data_source, write_disposition="append")
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
- assert len(all_table_names) == 5
- assert sorted(all_table_names) == [
- "rasa_bot_events_with_nesting_lvl_one",
- "rasa_bot_events_with_nesting_lvl_two",
- "rasa_bot_events_with_nesting_lvl_two__metadata__known_recipients",
- "rasa_bot_events_with_nesting_lvl_two__metadata__vendor_list",
- "third_resource_with_nested_data",
- ]
-
- # Check scenario #4
- # Set max_table_nesting via the setter and check the tables
- pipeline.drop()
- rasa_bot_events_resource = rasa_bot_events_with_nesting_lvl_one()
- pipeline.run(
- rasa_bot_events_resource,
- dataset_name="bot_events",
- write_disposition="append",
+ pipeline.run(base_table_1())
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(
+ _table_names_for_base_table("base_table_1", NESTING_LEVEL_2)
+ ).union(_table_names_for_base_table("base_table_1", NESTING_LEVEL_3, alternative_tree=True))
+
+
+def test_update_table_nesting_level_source():
+ """test if we can update the max_table_nesting level of a source"""
+
+ @dlt.resource()
+ def base_table_1():
+ yield example_data
+
+ @dlt.resource()
+ def base_table_2():
+ yield example_data
+
+ @dlt.resource(max_table_nesting=1)
+ def base_table_3():
+ yield example_data
+
+ @dlt.source(max_table_nesting=3)
+ def source():
+ return [base_table_1(), base_table_2(), base_table_3()]
+
+ pipeline = _get_pipeline()
+ pipeline.run(source().with_resources("base_table_1"))
+
+ assert set(pipeline.default_schema.data_table_names()) == set(
+ _table_names_for_base_table("base_table_1", NESTING_LEVEL_3)
)
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- all_table_names = pipeline_schema.data_table_names()
- count_all_tables_first_run = len(all_table_names)
- tables = pipeline_schema.data_table_names()
- assert count_all_tables_first_run == 1
- assert tables == ["rasa_bot_events_with_nesting_lvl_one"]
-
- # now adjust the max_table_nesting for resource and check
- pipeline.drop()
- rasa_bot_events_resource.max_table_nesting = 2
- assert rasa_bot_events_resource.max_table_nesting == 2
- pipeline.run(
- rasa_bot_events_resource,
- dataset_name="bot_events",
- write_disposition="append",
+
+ # change the max_table_nesting of the source and load another resource that was not loaded before
+ source.max_table_nesting = 4 # type: ignore
+ pipeline.run(source().with_resources("base_table_1", "base_table_2"))
+
+ # for base_table_1 the set of tables stays the same since it was already loaded at nesting level 3
+ assert set(pipeline.default_schema.data_table_names()) == set(
+ _table_names_for_base_table("base_table_1", NESTING_LEVEL_3)
+ + _table_names_for_base_table("base_table_2", NESTING_LEVEL_4)
)
- all_table_names = pipeline_schema.data_table_names()
- count_all_tables_second_run = len(all_table_names)
- assert count_all_tables_first_run < count_all_tables_second_run
-
- tables = pipeline_schema.data_table_names()
- assert count_all_tables_second_run == 3
- assert tables == [
- "rasa_bot_events_with_nesting_lvl_one",
- "rasa_bot_events_with_nesting_lvl_one__metadata__known_recipients",
- "rasa_bot_events_with_nesting_lvl_one__metadata__vendor_list",
- ]
- pipeline.drop()
- rasa_bot_events_resource.max_table_nesting = 10
- assert rasa_bot_events_resource.max_table_nesting == 10
- pipeline.run(rasa_bot_events_resource, dataset_name="bot_events")
- all_table_names = pipeline_schema.data_table_names()
- count_all_tables_second_run = len(all_table_names)
- assert count_all_tables_first_run < count_all_tables_second_run
-
- tables = pipeline_schema.data_table_names()
- assert count_all_tables_second_run == 8
- assert tables == [
- "rasa_bot_events_with_nesting_lvl_one",
- "rasa_bot_events_with_nesting_lvl_one__metadata__known_recipients",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__target",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__starbucks",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__amazon",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__deposit__employer",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__deposit__interest",
- "rasa_bot_events_with_nesting_lvl_one__metadata__vendor_list",
- ]
+ # load the full source (base_table_3 keeps its own max_table_nesting, which takes precedence over the source setting)
+ pipeline.run(source())
- pipeline.drop()
- third_resource_with_nested_data.max_table_nesting = 10
- assert third_resource_with_nested_data.max_table_nesting == 10
- pipeline.run(third_resource_with_nested_data)
- all_table_names = pipeline_schema.data_table_names()
- count_all_tables_second_run = len(all_table_names)
- assert count_all_tables_first_run < count_all_tables_second_run
-
- tables_with_nesting_level_set = pipeline_schema.data_table_names()
- assert count_all_tables_second_run == 5
- assert tables_with_nesting_level_set == all_table_names_for_third_resource
-
- # Set max_table_nesting=None and check if the same tables exist
- third_resource_with_nested_data.max_table_nesting = None
- assert third_resource_with_nested_data.max_table_nesting is None
- pipeline.run(third_resource_with_nested_data)
- all_table_names = pipeline_schema.data_table_names()
- count_all_tables_second_run = len(all_table_names)
- assert count_all_tables_first_run < count_all_tables_second_run
-
- tables = pipeline_schema.data_table_names()
- assert count_all_tables_second_run == 5
- assert tables == all_table_names_for_third_resource
- assert tables == tables_with_nesting_level_set
-
- # Check scenario #5
- # We give priority `max_table_nesting` of the resource if it is defined
- @dlt.source(max_table_nesting=1000)
- def some_data_with_table_nesting():
- yield rasa_bot_events_with_nesting_lvl_one()
+ assert set(pipeline.default_schema.data_table_names()) == set(
+ _table_names_for_base_table("base_table_1", NESTING_LEVEL_3)
+ + _table_names_for_base_table("base_table_2", NESTING_LEVEL_4)
+ + _table_names_for_base_table("base_table_3", NESTING_LEVEL_1)
+ )
- pipeline.drop()
- pipeline.run(some_data_with_table_nesting())
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- tables = pipeline_schema.data_table_names()
- assert len(tables) == 1
- assert tables == ["rasa_bot_events_with_nesting_lvl_one"]
- # Now check the case when `max_table_nesting` is not defined in the resource
- rasa_bot_events_with_nesting_lvl_one.max_table_nesting = None
+@pytest.mark.parametrize("nesting_defininition_location", ["resource", "source"])
+def test_nesting_levels_reset_after_drop(nesting_defininition_location: str):
+ """test if the nesting levels are reset after a drop"""
+
+ @dlt.resource(max_table_nesting=2 if nesting_definition_location == "resource" else None)
+ def base_table_1():
+ yield example_data
+
+ @dlt.source(max_table_nesting=2 if nesting_definition_location == "source" else None)
+ def source():
+ return base_table_1()
+
+ pipeline = _get_pipeline()
+ pipeline.run(source())
+
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(_table_names_for_base_table("base_table_1", NESTING_LEVEL_2))
pipeline.drop()
- pipeline.run(some_data_with_table_nesting())
- pipeline_schema = pipeline.schemas[pipeline.default_schema_name]
- tables = pipeline_schema.data_table_names()
- assert len(tables) == 8
- assert tables == [
- "rasa_bot_events_with_nesting_lvl_one",
- "rasa_bot_events_with_nesting_lvl_one__metadata__known_recipients",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__target",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__starbucks",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__spend__amazon",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__deposit__employer",
- "rasa_bot_events_with_nesting_lvl_one__metadata__transaction_history__deposit__interest",
- "rasa_bot_events_with_nesting_lvl_one__metadata__vendor_list",
- ]
+ if nesting_definition_location == "resource":
+ base_table_1.max_table_nesting = 3
+ else:
+ source.max_table_nesting = 3 # type: ignore[attr-defined]
+ pipeline.run(source())
+
+ all_table_names = pipeline.default_schema.data_table_names()
+ assert set(all_table_names) == set(_table_names_for_base_table("base_table_1", NESTING_LEVEL_3))
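
For orientation, the behavior exercised by the nesting tests above can be reproduced without the module's helpers. The sketch below is illustrative only: the sample rows, the pipeline name and the duckdb destination are assumptions, not values used by the test file.

import dlt

# deliberately nested sample rows (not the module's example_data)
sample_rows = [{"id": 1, "payload": {"meta": {"tags": [{"name": "a"}, {"name": "b"}]}}}]

@dlt.resource(max_table_nesting=1)
def demo_rows():
    yield sample_rows

pipeline = dlt.pipeline(pipeline_name="nesting_demo", destination="duckdb")
pipeline.run(demo_rows())
# structures below the configured nesting level stay in complex/json columns
# instead of being split into child tables
print(pipeline.default_schema.data_table_names())

# the limit can be raised on the resource itself; after a drop the schema is
# rebuilt and deeper child tables appear
demo_rows.max_table_nesting = 3
pipeline.drop()
pipeline.run(demo_rows())
print(pipeline.default_schema.data_table_names())
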
diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py
index c7a8832214..a3d8b489c9 100644
--- a/tests/pipeline/test_dlt_versions.py
+++ b/tests/pipeline/test_dlt_versions.py
@@ -247,7 +247,7 @@ def test_filesystem_pipeline_with_dlt_update(test_storage: FileStorage) -> None:
# attach to existing pipeline
pipeline = dlt.attach(GITHUB_PIPELINE_NAME, destination=filesystem("_storage/data"))
# assert end state
- assert_github_pipeline_end_state(pipeline, github_schema, 2)
+ pipeline = assert_github_pipeline_end_state(pipeline, github_schema, 2)
# load new state
fs_client = pipeline._fs_client()
state_files = sorted(fs_client.list_table_files("_dlt_pipeline_state"))
@@ -261,7 +261,7 @@ def test_filesystem_pipeline_with_dlt_update(test_storage: FileStorage) -> None:
def assert_github_pipeline_end_state(
pipeline: dlt.Pipeline, orig_schema: TStoredSchema, schema_updates: int
-) -> None:
+) -> dlt.Pipeline:
# get tables counts
table_counts = load_table_counts(pipeline, *pipeline.default_schema.data_table_names())
assert table_counts == {"issues": 100, "issues__assignees": 31, "issues__labels": 34}
@@ -286,6 +286,8 @@ def assert_github_pipeline_end_state(
# make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped
assert pipeline.default_schema.stored_version_hash == orig_schema["version_hash"]
+ return pipeline
+
def test_load_package_with_dlt_update(test_storage: FileStorage) -> None:
shutil.copytree("tests/pipeline/cases/github_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True)
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index 73125cbd6c..38ba3713bb 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -71,12 +71,11 @@ def test_default_pipeline() -> None:
p = dlt.pipeline()
# this is a name of executing test harness or blank pipeline on windows
possible_names = ["dlt_pytest", "dlt_pipeline"]
- possible_dataset_names = ["dlt_pytest_dataset", "dlt_pipeline_dataset"]
assert p.pipeline_name in possible_names
assert p.pipelines_dir == os.path.abspath(os.path.join(TEST_STORAGE_ROOT, ".dlt", "pipelines"))
assert p.runtime_config.pipeline_name == p.pipeline_name
- # dataset that will be used to load data is the pipeline name
- assert p.dataset_name in possible_dataset_names
+ # default dataset name is not created until a destination that requires it is set
+ assert p.dataset_name is None
assert p.destination is None
assert p.default_schema_name is None
@@ -95,7 +94,8 @@ def test_default_pipeline_dataset_layout(environment) -> None:
dataset_name_layout = "bobby_%s"
environment["DATASET_NAME_LAYOUT"] = dataset_name_layout
- p = dlt.pipeline()
+ # use destination that needs a dataset
+ p = dlt.pipeline(destination="filesystem")
# this is a name of executing test harness or blank pipeline on windows
possible_names = ["dlt_pytest", "dlt_pipeline"]
possible_dataset_names = [
@@ -107,7 +107,6 @@ def test_default_pipeline_dataset_layout(environment) -> None:
assert p.runtime_config.pipeline_name == p.pipeline_name
# dataset that will be used to load data is the pipeline name
assert p.dataset_name in possible_dataset_names
- assert p.destination is None
assert p.default_schema_name is None
# this is the same pipeline
@@ -121,14 +120,42 @@ def test_default_pipeline_dataset_layout(environment) -> None:
def test_default_pipeline_dataset() -> None:
- # dummy does not need a dataset
- p = dlt.pipeline(destination="dummy")
+ # no dataset and no destination
+ p = dlt.pipeline(pipeline_name="test_default_pipeline")
+ assert p.dataset_name is None
+ p._wipe_working_folder()
+
+ # dummy does not need a dataset (is schemaless)
+ p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="dummy")
assert p.dataset_name is None # so it is none
+ p._wipe_working_folder()
+
+ # clickhouse has optional dataset
+ p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="clickhouse")
+ assert p.dataset_name is None
+ p._wipe_working_folder()
# filesystem needs one
- possible_dataset_names = ["dlt_pytest_dataset", "dlt_pipeline_dataset"]
- p = dlt.pipeline(destination="filesystem")
- assert p.dataset_name in possible_dataset_names
+ p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="filesystem")
+ assert p.dataset_name == "test_default_pipeline_dataset"
+ p._wipe_working_folder()
+
+
+def test_default_pipeline_dataset_late_destination() -> None:
+ # no dataset and no destination
+ p = dlt.pipeline(pipeline_name="test_default_pipeline")
+ assert p.dataset_name is None
+
+ # default dataset name will be created
+ p.sync_destination(destination=dlt.destinations.filesystem(TEST_STORAGE_ROOT))
+ assert p.dataset_name == "test_default_pipeline_dataset"
+ p._wipe_working_folder()
+
+ p = dlt.pipeline(pipeline_name="test_default_pipeline")
+ # dummy won't set dataset
+ p.sync_destination(destination="dummy")
+ print(p.dataset_name)
+ assert p.dataset_name is None
def test_default_pipeline_dataset_name(environment) -> None:
@@ -231,9 +258,9 @@ def test_pipeline_with_non_alpha_name() -> None:
p = dlt.pipeline(pipeline_name=name)
name = "another pipeline __8329イロハニホヘト"
- p = dlt.pipeline(pipeline_name=name)
+ p = dlt.pipeline(pipeline_name=name, destination="filesystem")
assert p.pipeline_name == name
- # default dataset is set
+ # default dataset is set (we used filesystem destination that requires dataset)
assert p.dataset_name == f"{name}_dataset"
# also pipeline name in runtime must be correct
assert p.runtime_config.pipeline_name == p.pipeline_name
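
The test_pipeline.py hunks above all follow from one behavior change: the default dataset name is no longer derived from the pipeline name up front, but only once a destination that actually needs a dataset is configured. A minimal sketch of what the updated tests assert, using illustrative pipeline names:

import dlt

# no destination yet: no dataset name is derived
p1 = dlt.pipeline(pipeline_name="dataset_demo_1")
assert p1.dataset_name is None

# the dummy destination works without a dataset, so the name stays unset
p2 = dlt.pipeline(pipeline_name="dataset_demo_2", destination="dummy")
assert p2.dataset_name is None

# filesystem needs a dataset, so the default "<pipeline_name>_dataset" is created
p3 = dlt.pipeline(pipeline_name="dataset_demo_3", destination="filesystem")
assert p3.dataset_name == "dataset_demo_3_dataset"
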
@@ -1518,6 +1545,21 @@ def test_drop_with_new_name() -> None:
assert new_pipeline.pipeline_name == new_test_name
+ # load to old pipeline
+ pipeline.run([1, 2, 3], table_name="p1")
+ new_pipeline.run([1, 2, 3], table_name="p2")
+
+ assert_data_table_counts(pipeline, {"p1": 3})
+ assert_data_table_counts(new_pipeline, {"p2": 3})
+
+
+def test_drop() -> None:
+ pipeline = dlt.pipeline(pipeline_name="test_drop", destination="duckdb")
+ clean_pipeline = pipeline.drop()
+ assert clean_pipeline is pipeline
+ assert clean_pipeline.pipeline_name == "test_drop"
+ pipeline.run([1, 2, 3], table_name="numbers")
+
def test_schema_version_increase_and_source_update() -> None:
now = pendulum.now()
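
test_drop above pins down that drop() returns the very same pipeline object, reset and immediately usable. A small sketch of that contract, with illustrative names and a local duckdb destination:

import dlt

p = dlt.pipeline(pipeline_name="drop_demo", destination="duckdb")
p.run([{"n": 1}], table_name="numbers")

# drop() resets the local pipeline state and returns the same instance,
# so it can be chained straight into the next run
same_p = p.drop()
assert same_p is p
same_p.run([{"n": 2}], table_name="numbers")
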
@@ -2806,3 +2848,32 @@ def events():
pipeline = dlt.pipeline(destination="duckdb")
pipeline.run(events())
+
+
+def test_push_table_with_upfront_schema() -> None:
+ # infer schema
+ pipeline = dlt.pipeline(pipeline_name="push_table_infer_pipeline", destination="duckdb")
+ info = pipeline.run(_get_shuffled_events())
+ assert_load_info(info)
+
+ # get resource as table
+ data = list(_get_shuffled_events())
+
+ # save into other pipeline
+ infer_hash = pipeline.default_schema.version_hash
+ copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
+ info = copy_pipeline.run(
+ data, table_name="_get_shuffled_events", schema=pipeline.default_schema
+ )
+ copy_schema = copy_pipeline.default_schema
+ # make sure that schema hash didn't change - we didn't change anything in the data
+ assert copy_pipeline.default_schema.version_hash == infer_hash
+ copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
+ info = copy_pipeline.run(data, table_name="_get_shuffled_events", schema=copy_schema)
+ assert copy_pipeline.default_schema.version_hash == infer_hash
+ copy_schema = copy_pipeline.default_schema
+
+ # another table
+ copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
+ info = copy_pipeline.run(data, table_name="events", schema=copy_schema)
+ assert copy_pipeline.default_schema.version_hash != infer_hash
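
test_push_table_with_upfront_schema checks that re-running identical data under an explicitly passed schema does not bump the schema version hash, while adding a new table does. A condensed sketch of the same idea; the row shape and pipeline names are assumptions:

import dlt

rows = [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]

# infer a schema once
source_pipeline = dlt.pipeline(pipeline_name="infer_demo", destination="duckdb")
source_pipeline.run(rows, table_name="rows")
inferred_hash = source_pipeline.default_schema.version_hash

# reuse the inferred schema in another pipeline: same data, same hash
copy_pipeline = dlt.pipeline(pipeline_name="copy_demo", destination="duckdb")
copy_pipeline.run(rows, table_name="rows", schema=source_pipeline.default_schema)
assert copy_pipeline.default_schema.version_hash == inferred_hash

# loading under a new table name adds a table, so the hash changes
copy_pipeline.run(rows, table_name="other_rows")
assert copy_pipeline.default_schema.version_hash != inferred_hash
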
diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py
index 784e0447ff..cc0616fc9a 100644
--- a/tests/pipeline/test_pipeline_trace.py
+++ b/tests/pipeline/test_pipeline_trace.py
@@ -586,9 +586,17 @@ def data():
assert event["properties"]["destination_name"] is None
assert event["properties"]["destination_type"] is None
assert event["properties"]["pipeline_name_hash"] == digest128("fresh")
- assert event["properties"]["dataset_name_hash"] == digest128(p.dataset_name)
+ assert event["properties"]["dataset_name_hash"] is None
assert event["properties"]["default_schema_name_hash"] == digest128(p.default_schema_name)
+ # trace with dataset name
+ p = dlt.pipeline(pipeline_name="fresh", dataset_name="fresh_dataset").drop()
+ ANON_TRACKER_SENT_ITEMS.clear()
+ SENTRY_SENT_ITEMS.clear()
+ p.extract([1, 2, 3], table_name="data")
+ event = ANON_TRACKER_SENT_ITEMS[0]
+ assert event["properties"]["dataset_name_hash"] == digest128(p.dataset_name)
+
def test_extract_data_describe() -> None:
schema = Schema("test")
diff --git a/tests/sources/filesystem/test_filesystem_pipeline_template.py b/tests/sources/filesystem/test_filesystem_pipeline_template.py
index 38c51c110c..e4b05cb76d 100644
--- a/tests/sources/filesystem/test_filesystem_pipeline_template.py
+++ b/tests/sources/filesystem/test_filesystem_pipeline_template.py
@@ -15,7 +15,7 @@
),
)
def test_all_examples(example_name: str) -> None:
- from dlt.sources import filesystem_pipeline
+ from dlt.sources._core_source_templates import filesystem_pipeline
filesystem_pipeline.TESTS_BUCKET_URL = TEST_SAMPLE_FILES
diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py
index 49a6275536..85276a263f 100644
--- a/tests/sources/helpers/rest_client/test_paginators.py
+++ b/tests/sources/helpers/rest_client/test_paginators.py
@@ -530,6 +530,12 @@ def test_update_state(self):
assert paginator._next_reference == "cursor-2"
assert paginator.has_next_page is True
+ def test_update_state_when_cursor_path_is_empty_string(self):
+ paginator = JSONResponseCursorPaginator(cursor_path="next_cursor")
+ response = Mock(Response, json=lambda: {"next_cursor": "", "results": []})
+ paginator.update_state(response)
+ assert paginator.has_next_page is False
+
def test_update_request(self):
paginator = JSONResponseCursorPaginator(cursor_path="next_cursor")
paginator._next_reference = "cursor-2"
diff --git a/tests/sources/rest_api/configurations/test_incremental_config.py b/tests/sources/rest_api/configurations/test_incremental_config.py
index a374b644df..0527cc9c72 100644
--- a/tests/sources/rest_api/configurations/test_incremental_config.py
+++ b/tests/sources/rest_api/configurations/test_incremental_config.py
@@ -112,11 +112,13 @@ def test_constructs_incremental_from_request_param() -> None:
"type": "incremental",
"cursor_path": "updated_at",
"initial_value": "2024-01-01T00:00:00Z",
+ "lag": 360.5,
},
}
- (incremental_config, incremental_param, _) = setup_incremental_object(request_params)
- assert incremental_config == dlt.sources.incremental(
- cursor_path="updated_at", initial_value="2024-01-01T00:00:00Z"
+ (incremental, incremental_param, _) = setup_incremental_object(request_params)
+ # incremental is a dataclass so you can compare field-wise
+ assert incremental == dlt.sources.incremental(
+ cursor_path="updated_at", initial_value="2024-01-01T00:00:00Z", lag=360.5
)
assert incremental_param == IncrementalParam(start="since", end=None)
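
The updated assertion relies on dlt.sources.incremental comparing like a dataclass, so the new lag value participates in equality. A minimal sketch of that comparison:

import dlt

a = dlt.sources.incremental(cursor_path="updated_at", initial_value="2024-01-01T00:00:00Z", lag=360.5)
b = dlt.sources.incremental(cursor_path="updated_at", initial_value="2024-01-01T00:00:00Z", lag=360.5)

# field-wise equality: identical cursor_path, initial_value and lag compare equal
assert a == b
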
diff --git a/tests/sources/rest_api/test_rest_api_pipeline_template.py b/tests/sources/rest_api/test_rest_api_pipeline_template.py
index b397984d9f..786dd10931 100644
--- a/tests/sources/rest_api/test_rest_api_pipeline_template.py
+++ b/tests/sources/rest_api/test_rest_api_pipeline_template.py
@@ -12,7 +12,7 @@
),
)
def test_all_examples(example_name: str) -> None:
- from dlt.sources import rest_api_pipeline
+ from dlt.sources._core_source_templates import rest_api_pipeline
# reroute token location from secrets
github_token: TSecretStrValue = dlt.secrets.get("sources.github.access_token")
diff --git a/tests/sources/sql_database/test_sql_database_pipeline_template.py b/tests/sources/sql_database/test_sql_database_pipeline_template.py
index 88c05ea333..bf2f8c3707 100644
--- a/tests/sources/sql_database/test_sql_database_pipeline_template.py
+++ b/tests/sources/sql_database/test_sql_database_pipeline_template.py
@@ -17,6 +17,6 @@
),
)
def test_all_examples(example_name: str) -> None:
- from dlt.sources import sql_database_pipeline
+ from dlt.sources._core_source_templates import sql_database_pipeline
getattr(sql_database_pipeline, example_name)()
diff --git a/tests/sources/test_pipeline_templates.py b/tests/sources/test_pipeline_templates.py
index a83ccff67f..a257e14a66 100644
--- a/tests/sources/test_pipeline_templates.py
+++ b/tests/sources/test_pipeline_templates.py
@@ -15,6 +15,6 @@
],
)
def test_debug_pipeline(template_name: str, examples: str) -> None:
- demo_module = importlib.import_module(f"dlt.sources.pipeline_templates.{template_name}")
+ demo_module = importlib.import_module(f"dlt.sources._single_file_templates.{template_name}")
for example_name in examples:
getattr(demo_module, example_name)()
diff --git a/tests/tests/__init__.py b/tests/tests/__init__.py
new file mode 100644
index 0000000000..44b3c3adc8
--- /dev/null
+++ b/tests/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for our test helpers"""
diff --git a/tests/tests/load/__init__.py b/tests/tests/load/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/tests/load/test_utils.py b/tests/tests/load/test_utils.py
new file mode 100644
index 0000000000..6547b21663
--- /dev/null
+++ b/tests/tests/load/test_utils.py
@@ -0,0 +1,49 @@
+# NOTE: these should be run with all destinations enabled
+# NOTE: these are very rudimentary tests and should be extended
+
+from typing import Any
+
+from tests.load.utils import destinations_configs, DEFAULT_BUCKETS, DestinationTestConfiguration
+
+
+def test_empty_destinations_configs():
+ configs = destinations_configs()
+ assert len(configs) == 0
+
+
+def _assert_name_uniqueness(configs: Any) -> None:
+ identifiers = [config[0][0].name for config in configs]
+ print(identifiers)
+ assert len(identifiers) == len(set(identifiers)), "Identifier uniqueness violated"
+
+
+def test_enable_filesystem_configs():
+ # enable local filesystem configs
+ configs = destinations_configs(local_filesystem_configs=True)
+ _assert_name_uniqueness(configs)
+ assert len(configs) == 3
+
+ # enable all buckets configs
+ configs = destinations_configs(all_buckets_filesystem_configs=True)
+ _assert_name_uniqueness(configs)
+ assert len(configs) == len(DEFAULT_BUCKETS) == 7 # hardcoded for now
+
+ # enable with delta tables
+ configs = destinations_configs(
+ all_buckets_filesystem_configs=True, table_format_filesystem_configs=True
+ )
+ _assert_name_uniqueness(configs)
+ assert (
+ len(configs) == len(DEFAULT_BUCKETS) * 2 == 7 * 2
+ ) # we have delta now, so double the expected amount
+
+
+def test_uniqueness_of_names():
+ configs = destinations_configs(
+ default_sql_configs=True,
+ default_vector_configs=True,
+ default_staging_configs=True,
+ all_staging_configs=True,
+ all_buckets_filesystem_configs=True,
+ )
+ _assert_name_uniqueness(configs)
diff --git a/tests/utils.py b/tests/utils.py
index 8ae301a4ab..1aafa4bfe4 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -157,9 +157,27 @@ def autouse_test_storage() -> FileStorage:
@pytest.fixture(scope="function", autouse=True)
def preserve_environ() -> Iterator[None]:
saved_environ = environ.copy()
- yield
- environ.clear()
- environ.update(saved_environ)
+ # delta-rs sets these keys in the process environment without updating
+ # os.environ, and there is no way to refresh os.environ afterwards
+ known_environ = {
+ key_: saved_environ.get(key_)
+ for key_ in [
+ "AWS_ACCESS_KEY_ID",
+ "AWS_SECRET_ACCESS_KEY",
+ "AWS_REGION",
+ "AWS_SESSION_TOKEN",
+ ]
+ }
+ try:
+ yield
+ finally:
+ environ.clear()
+ environ.update(saved_environ)
+ for key_, value_ in known_environ.items():
+ if value_ is not None or key_ not in environ:
+ environ[key_] = value_ or ""
+ else:
+ del environ[key_]
@pytest.fixture(autouse=True)