-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add remote table materialization #8
base: dap-main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,7 @@ | |
db.engine as db_engine, | ||
{%- if adapter.get_clickhouse_cluster_name() -%} | ||
count(distinct _shard_num) > 1 as is_on_cluster | ||
from clusterAllReplicas({{ adapter.get_clickhouse_cluster_name() }}, system.tables) as t | ||
from clusterAllReplicas('{{ adapter.get_clickhouse_cluster_name() }}', system.tables) as t | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All three variants work select * from cluster(default, system.one);
select * from cluster('default', system.one);
select * from cluster("default", system.one); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was worried about passing reserved names to the first parameter |
||
join system.databases as db on t.database = db.name | ||
where schema = '{{ schema_relation.schema }}' | ||
group by name, schema, type, db_engine | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,24 +86,30 @@ | |
|
||
{% endmaterialization %} | ||
|
||
{% macro create_distributed_table(relation, local_relation) %} | ||
{%- set cluster = adapter.get_clickhouse_cluster_name() -%} | ||
{% if cluster is none %} | ||
{% do exceptions.raise_compiler_error('Cluster name should be defined for using distributed materializations, current is None') %} | ||
{% endif %} | ||
{% macro create_distributed_table(relation, local_relation, sql=none) %} | ||
{% if adapter.get_clickhouse_cluster_name() is none %} | ||
{% do exceptions.raise_compiler_error('Cluster name should be defined for using distributed materializations, current is None') %} | ||
{% endif %} | ||
|
||
{%- set cluster = cluster[1:-1] -%} | ||
{%- set sharding = config.get('sharding_key') -%} | ||
|
||
create or replace table {{ relation }} {{ on_cluster_clause(relation) }} as {{ local_relation }} | ||
ENGINE = Distributed('{{ cluster}}', '{{ local_relation.schema }}', '{{ local_relation.name }}' | ||
{%- if sharding is not none and sharding.strip() != '' -%} | ||
, {{ sharding }} | ||
{%- else %} | ||
, rand() | ||
{% endif -%} | ||
) | ||
{% endmacro %} | ||
{%- set remote_cluster = local_relation.remote_cluster or adapter.get_clickhouse_cluster_name() -%} | ||
{%- set sharding = config.get('sharding_key') -%} | ||
{%- set reference = "as " ~ local_relation -%} | ||
{%- if sql -%} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the use case for having an SQL when creating a distributed table? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you don't rely on an existing local relation for the ddl query. the sql will allow to directly pass the column names and types |
||
{%- set col_list = [] -%} | ||
{%- for col in adapter.get_column_schema_from_query(sql) -%} | ||
{%- do col_list.append(col.name + ' ' + col.data_type) -%} | ||
{%- endfor -%} | ||
{%- set reference = "(" ~ (col_list | join(', ')) ~ ")" -%} | ||
{%- endif -%} | ||
create or replace table {{ relation }} {{ on_cluster_clause(relation) }} {{ reference }} | ||
engine = Distributed('{{ remote_cluster }}', '{{ local_relation.schema }}', '{{ local_relation.name }}' | ||
{%- if sharding is not none and sharding.strip() != '' -%} | ||
, {{ sharding }} | ||
{%- else %} | ||
, rand() | ||
{% endif -%} | ||
) | ||
{% endmacro %} | ||
|
||
{% macro create_empty_table_from_relation(relation, source_relation, sql=none) -%} | ||
{%- set sql_header = config.get('sql_header', none) -%} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
{% materialization remote_table, adapter='clickhouse', supported_languages=['python', 'sql'] -%} | ||
{%- set remote_config = config.get('remote_config', {}) -%} | ||
{%- set remote_cluster = remote_config.get('cluster') or adapter.get_clickhouse_cluster_name() -%} | ||
{%- set remote_schema = remote_config.get('schema') or adapter.get_clickhouse_local_db_prefix() + this.schema -%} | ||
{%- set remote_identifier = remote_config.get('identifier') or this.identifier + adapter.get_clickhouse_local_suffix() -%} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will |
||
|
||
{% set target_relation = this.incorporate(type='table') %} | ||
{% set remote_relation = target_relation.incorporate(path={"identifier": remote_identifier, "schema": remote_schema}, remote_cluster=remote_cluster) %} | ||
|
||
{{ run_hooks(pre_hooks, inside_transaction=False) }} | ||
{{ run_hooks(pre_hooks, inside_transaction=True) }} | ||
|
||
{% call statement('main') %} | ||
{{ create_distributed_table(target_relation, remote_relation, sql) }} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where is the sql defined? |
||
{% endcall %} | ||
|
||
{% set should_revoke = should_revoke(target_relation, full_refresh_mode) %} | ||
{% set grant_config = config.get('grants') %} | ||
{% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} | ||
{% do persist_docs(target_relation, model) %} | ||
|
||
{{ run_hooks(post_hooks, inside_transaction=True) }} | ||
{{ adapter.commit() }} | ||
{{ run_hooks(post_hooks, inside_transaction=False) }} | ||
{{ return({'relations': [target_relation]}) }} | ||
|
||
{% endmaterialization %} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import pytest | ||
from dbt.tests.util import run_dbt | ||
|
||
|
||
model_with_defaults = """ | ||
{{ | ||
config( | ||
materialized='remote_table', | ||
) | ||
}} | ||
select toUInt64(number) as key1, toInt64(-number) as key2 from numbers(10) | ||
""" | ||
|
||
|
||
class TestRemoteTable: | ||
@pytest.fixture(scope="class") | ||
def models(self): | ||
return { | ||
"remote_table.sql": model_with_defaults, | ||
} | ||
|
||
def test_with_defaults(self, project): | ||
# initialize a local table on current cluster | ||
project.run_sql(f""" | ||
create table {project.test_schema}.remote_table_local on cluster {project.test_config["cluster"]} | ||
(key1 UInt64, key2 Int64) | ||
engine=ReplicatedMergeTree order by key1 | ||
""") | ||
|
||
# this `remote_table` run with default configuration will point to previously created local table, taking | ||
# `local_db_prefix` and `local_suffix` settings into account. | ||
run_dbt() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. create or replace table `dbt_clickhouse_3961_test_remote_table_1736978435032`.`remote_table`
(key1 UInt64, key2 Int64)
engine = Distributed('test_shard', 'dbt_clickhouse_3961_test_remote_table_1736978435032', 'remote_table_local'
, rand()
) I copied this DDL from test debugging, and I have two questions.
|
||
|
||
# insert into distributed table | ||
project.run_sql(f""" | ||
insert into {project.test_schema}.remote_table | ||
select toUInt64(number) as key1, toInt64(-number) as key2 from numbers(10) | ||
settings insert_quorum=1 | ||
""") | ||
|
||
_assert_is_distributed_table(project) | ||
_assert_correct_distributed_data(project) | ||
|
||
|
||
model_with_remote_configuration = """ | ||
{{ | ||
config( | ||
materialized='remote_table', | ||
remote_config={'cluster': 'test_remote', 'schema': this.schema, 'identifier': 'remote_target_table'}, | ||
sharding_key='key1', | ||
) | ||
}} | ||
select toUInt64(number) as key1, toInt64(-number) as key2 from numbers(10) | ||
""" | ||
|
||
|
||
class TestRemoteTableRemoteConfig: | ||
@pytest.fixture(scope="class") | ||
def models(self): | ||
return { | ||
"remote_table.sql": model_with_remote_configuration, | ||
} | ||
|
||
def test_with_remote_configuration(self, project): | ||
# initialize a local table on remote cluster 'test_remote_cluster' | ||
project.run_sql(f"create database if not exists {project.test_schema} on cluster `test_remote`") | ||
project.run_sql(f""" | ||
create table {project.test_schema}.remote_target_table on cluster `test_remote` | ||
engine=MergeTree order by key1 | ||
as select toUInt64(number) as key1, toInt64(-number) as key2 from numbers(10) | ||
""") | ||
|
||
# the created distributed table should point to a local table as defined in the model's `remote_config` | ||
run_dbt() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. create or replace table `dbt_clickhouse_3076_test_remote_table_1736979527642`.`remote_table`
(key1 UInt64, key2 Int64)
engine = Distributed('test_remote', 'dbt_clickhouse_3076_test_remote_table_1736979527642', 'remote_target_table', key1) Is |
||
|
||
_assert_is_distributed_table(project) | ||
_assert_correct_distributed_data(project) | ||
|
||
# assert correct engine parameters | ||
result = project.run_sql(f"select create_table_query from system.tables where name='remote_table'", fetch="one") | ||
assert f"Distributed('test_remote', '{project.test_schema}', 'remote_target_table', key1)" in result[0] | ||
|
||
|
||
def _assert_is_distributed_table(project): | ||
# check correct table creation on current host | ||
result = project.run_sql( | ||
f"select engine from system.tables where name='remote_table'", | ||
fetch="one" | ||
) | ||
assert result is not None | ||
assert result[0] == "Distributed" | ||
|
||
|
||
def _assert_correct_distributed_data(project): | ||
# query remote data from distributed table | ||
result = project.run_sql("select count(*) as num_rows from remote_table", fetch="one") | ||
assert result[0] == 10 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also works :D
code
dag