diff --git a/.circleci/config.yml b/.circleci/config.yml index f1c1b46..bd6f7a4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -146,7 +146,7 @@ workflows: context: DBT_SYNAPSE_PROFILE - integration-dbt-utils-azuresql: *dbt-context - integration-dbt-audit-helper-azuresql: *dbt-context - # - integration-dbt-expectations-azuresql: *dbt-context + - integration-dbt-expectations-azuresql: *dbt-context - integration-dbt-date-azuresql: *dbt-context - integration-dbt-utils-synapse: &syn-step <<: *dbt-context @@ -156,12 +156,12 @@ workflows: <<: *dbt-context requires: - start-synapse - # - integration-dbt-expectations-synapse: *syn-step + - integration-dbt-expectations-synapse: *syn-step - integration-dbt-date-synapse: *syn-step - pause-synapse: <<: *dbt-context requires: - integration-dbt-utils-synapse - integration-dbt-audit-helper-synapse - # - integration-dbt-expectations-synapse + - integration-dbt-expectations-synapse - integration-dbt-date-synapse diff --git a/.gitmodules b/.gitmodules index 5aa8261..9891d2a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,6 +12,5 @@ branch = main [submodule "dbt-audit-helper"] path = dbt-audit-helper - url = https://github.com/alieus/dbt-audit-helper - branch = adapter_dispatch - \ No newline at end of file + url = https://github.com/fishtown-analytics/dbt-audit-helper + branch = master diff --git a/dbt-audit-helper b/dbt-audit-helper index a46aeb1..c6b917c 160000 --- a/dbt-audit-helper +++ b/dbt-audit-helper @@ -1 +1 @@ -Subproject commit a46aeb19fb56f5917cd6a2aea5ad7fe1645eb829 +Subproject commit c6b917c62aa18e43b8e73d72cd17a9a8e3721116 diff --git a/dbt-date b/dbt-date index f877c86..e58387d 160000 --- a/dbt-date +++ b/dbt-date @@ -1 +1 @@ -Subproject commit f877c860c253a97a11408fc78be510732933349d +Subproject commit e58387dd6e85b116e693e2675193d71547a09a35 diff --git a/dbt-expectations b/dbt-expectations index 2a58ec6..cd3ff97 160000 --- a/dbt-expectations +++ b/dbt-expectations @@ -1 +1 @@ -Subproject commit 2a58ec6969307330ca97d8cec18b80d488c24624 +Subproject commit cd3ff974a21d5d98d26ab43ce1599fd529dbba22 diff --git a/dbt-utils b/dbt-utils index 4fbaab8..bbba960 160000 --- a/dbt-utils +++ b/dbt-utils @@ -1 +1 @@ -Subproject commit 4fbaab83c8dc79e6c8d225c4880fb796db97001e +Subproject commit bbba960726667abc66b42624f0d36bbb62c37593 diff --git a/integration_tests/dbt_expectations/dbt_project.yml b/integration_tests/dbt_expectations/dbt_project.yml index 8abd016..f025f41 100644 --- a/integration_tests/dbt_expectations/dbt_project.yml +++ b/integration_tests/dbt_expectations/dbt_project.yml @@ -24,5 +24,15 @@ vars: models: dbt_expectations_integration_tests: - data_test_factored: - +materialized: table \ No newline at end of file + schema_tests: + data_test_factored: + +materialized: table + emails: &disabled #TODO + +enabled: false + timeseries_data: *disabled + # anders's weird NoneType error + # (see https://github.com/calogica/dbt-expectations/pull/63) + data_test: *disabled + # Need to implement for synapse + data_test_factored: *disabled + timeseries_data_extended: *disabled diff --git a/macros/dbt_expectations/schema_tests/_generalized/equal_expression.sql b/macros/dbt_expectations/schema_tests/_generalized/equal_expression.sql index 7b7cc57..9cc7aaa 100644 --- a/macros/dbt_expectations/schema_tests/_generalized/equal_expression.sql +++ b/macros/dbt_expectations/schema_tests/_generalized/equal_expression.sql @@ -1,8 +1,10 @@ {%- macro sqlserver__get_select(model, expression, row_condition, group_by) %} select - {% for g in group_by -%} - {{ g }} as col_{{ loop.index }}, - {% endfor -%} + {# {%- if group_by %} #} + {% for g in group_by or [] -%} + {{ g }} as col_{{ loop.index }}, + {% endfor -%} + {# {% endif %} #} {{ expression }} as expression from {{ model }} @@ -10,8 +12,97 @@ where {{ row_condition }} {% endif %} - group by - {% for g in group_by -%} - {{ g }}{% if not loop.last %}, {% endif %} - {% endfor %} -{% endmacro -%} \ No newline at end of file + {%- if group_by %} + {%- if group_by|length > 1 or group_by[0] != "'col'" %} + group by + {% for g in group_by -%} + {{ g }}{% if not loop.last %},{% endif %} + {% endfor %} + {% endif %} + {% endif %} +{% endmacro -%} + +{%- macro sqlserver__test_equal_expression(model, expression, + compare_model, + compare_expression, + group_by, + compare_group_by, + row_condition, + compare_row_condition, + tolerance, + tolerance_percent, + return_difference) -%} + + {%- set compare_model = model if not compare_model else compare_model -%} + {%- set compare_expression = expression if not compare_expression else compare_expression -%} + {%- set compare_row_condition = row_condition if not compare_row_condition else compare_row_condition -%} + {%- set compare_group_by = group_by if not compare_group_by else compare_group_by -%} + + {%- set n_cols = group_by|length if group_by else 0 %} + with a as ( + {{ dbt_expectations.get_select(model, expression, row_condition, group_by) }} + ), + b as ( + {{ dbt_expectations.get_select(compare_model, compare_expression, compare_row_condition, compare_group_by) }} + ), + final as ( + + select + {% for i in range(1, n_cols + 1) -%} + coalesce(a.col_{{ i }}, b.col_{{ i }}) as col_{{ i }}, + {% endfor %} + a.expression, + b.expression as compare_expression, + abs(coalesce(a.expression, 0) - coalesce(b.expression, 0)) as expression_difference, + abs(coalesce(a.expression, 0) - coalesce(b.expression, 0))/ + nullif(a.expression, 0) as expression_difference_percent + from + {% if n_cols > 0 %} + a + full outer join + b on + {% for i in range(1, n_cols + 1) -%} + a.col_{{ i }} = b.col_{{ i }} {% if not loop.last %}and{% endif %} + {% endfor -%} + {% else %} + a cross join b + {% endif %} + ) + -- DEBUG: + -- select * from final + select + {% if return_difference %} + coalesce(sum(expression_difference), 0) + {% else %} + count(*) + {% endif %} + from final + where + {% if tolerance_percent %} + expression_difference_percent > {{ tolerance_percent }} + {% else %} + expression_difference > {{ tolerance }} + {% endif %} +{%- endmacro -%} + +{%- macro synapse__test_equal_expression(model, expression, + compare_model, + compare_expression, + group_by, + compare_group_by, + row_condition, + compare_row_condition, + tolerance, + tolerance_percent, + return_difference) -%} + {% do return( tsql_utils.sqlserver__test_equal_expression(model, expression, + compare_model, + compare_expression, + group_by, + compare_group_by, + row_condition, + compare_row_condition, + tolerance, + tolerance_percent, + return_difference)) -%} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/_generalized/expression_is_true.sql b/macros/dbt_expectations/schema_tests/_generalized/expression_is_true.sql new file mode 100644 index 0000000..f8523c2 --- /dev/null +++ b/macros/dbt_expectations/schema_tests/_generalized/expression_is_true.sql @@ -0,0 +1,48 @@ +{% macro sqlserver__expression_is_true(model, expression, test_condition, group_by_columns, row_condition) %} + +{% if test_condition == "= true" %} + {% set test_condition = "= 1" %} +{% endif %} + + +with grouped_expression as ( + + select + {% if group_by_columns %} + {% for group_by_column in group_by_columns -%} + {{ group_by_column }} as col_{{ loop.index }}, + {% endfor -%} + {% endif %} + case when {{ expression }} then 1 else 0 end as expression + from {{ model }} + {%- if row_condition %} + where + {{ row_condition }} + {% endif %} + {% if group_by_columns %} + group by + {% for group_by_column in group_by_columns -%} + {{ group_by_column }}{% if not loop.last %},{% endif %} + {% endfor %} + {% endif %} + +), +validation_errors as ( + + select + * + from + grouped_expression + where + not(expression {{ test_condition }}) + +) + +select count(*) +from validation_errors + +{% endmacro %} + +{% macro synapse__expression_is_true(model, expression, test_condition, group_by_columns, row_condition) %} + {% do return( tsql_utils.sqlserver__expression_is_true(model, expression, test_condition, group_by_columns, row_condition)) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_most_common_value_to_be_in_set.sql b/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_most_common_value_to_be_in_set.sql new file mode 100644 index 0000000..a08c016 --- /dev/null +++ b/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_most_common_value_to_be_in_set.sql @@ -0,0 +1,100 @@ +{% macro sqlserver__test_expect_column_most_common_value_to_be_in_set(model, column_name, + value_set, + top_n, + quote_values=False, + data_type="decimal", + row_condition=None + ) -%} + +with value_counts as ( + + select + {% if quote_values -%} + {{ column_name }} + {%- else -%} + cast({{ column_name }} as {{ data_type }}) + {%- endif %} as value_field, + count(*) as value_count + + from {{ model }} + {% if row_condition %} + where {{ row_condition }} + {% endif %} + + group by {% if quote_values -%} + {{ column_name }} + {%- else -%} + cast({{ column_name }} as {{ data_type }}) + {%- endif %} + +), +value_counts_ranked as ( + + select + *, + row_number() over(order by value_count desc) as value_count_rank + from + value_counts + +), +value_count_top_n as ( + + select + value_field + from + value_counts_ranked + where + value_count_rank = {{ top_n }} + +), +set_values as ( + + {% for value in value_set -%} + select + {% if quote_values -%} + '{{ value }}' + {%- else -%} + cast({{ value }} as {{ data_type }}) + {%- endif %} as value_field + {% if not loop.last %}union all{% endif %} + {% endfor %} + +), +unique_set_values as ( + + select distinct value_field + from + set_values + +), +validation_errors as ( + -- values from the model that are not in the set + select + value_field + from + value_count_top_n + where + value_field not in (select value_field from unique_set_values) + +) + +select count(*) as validation_errors +from validation_errors + +{% endmacro %} + +{% macro synapse__test_expect_column_most_common_value_to_be_in_set(model, column_name, + value_set, + top_n, + quote_values, + data_type, + row_condition + ) -%} + {% do return( tsql_utils.sqlserver__test_expect_column_most_common_value_to_be_in_set(model, column_name, + value_set, + top_n, + quote_values=False, + data_type="decimal", + row_condition=None + )) -%} +{% endmacro %} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_stdev_to_be_between.sql b/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_stdev_to_be_between.sql new file mode 100644 index 0000000..39db47d --- /dev/null +++ b/macros/dbt_expectations/schema_tests/aggregate_functions/expect_column_stdev_to_be_between.sql @@ -0,0 +1,27 @@ +{% macro sqlserver__test_expect_column_stdev_to_be_between(model, column_name, + min_value, + max_value, + row_condition=None + ) -%} +{% set expression %} +stdev({{ column_name }}) +{% endset %} +{{ dbt_expectations.expression_between(model, + expression=expression, + min_value=min_value, + max_value=max_value, + row_condition=row_condition + ) }} +{% endmacro %} + +{% macro synapse__test_expect_column_stdev_to_be_between(model, column_name, + min_value, + max_value, + row_condition + ) -%} + {% do return( tsql_utils.sqlserver__test_expect_column_stdev_to_be_between(model, column_name, + min_value, + max_value, + row_condition=None + )) -%} +{% endmacro %} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_moving_stdevs.sql b/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_moving_stdevs.sql new file mode 100644 index 0000000..84f8834 --- /dev/null +++ b/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_moving_stdevs.sql @@ -0,0 +1,149 @@ +{%- macro _get_metric_expression(metric_column, take_logs) -%} + +{%- if take_logs %} +{%- set expr = "nullif(" ~ metric_column ~ ", 0)" -%} +coalesce({{ dbt_expectations.log_natural(expr) }}, 0) +{%- else -%} +coalesce({{ metric_column }}, 0) +{%- endif %} + +{%- endmacro -%} + + +{% macro sqlserver__test_expect_column_values_to_be_within_n_moving_stdevs(model, + column_name, + date_column_name, + period, + lookback_periods, + trend_periods, + test_periods, + sigma_threshold, + sigma_threshold_upper, + sigma_threshold_lower, + take_diffs, + take_logs + ) %} + +{%- set sigma_threshold_upper = sigma_threshold_upper if sigma_threshold_upper else sigma_threshold -%} +{%- set sigma_threshold_lower = sigma_threshold_lower if sigma_threshold_lower else -1 * sigma_threshold -%} + + + +with grouped_metric_values as ( + + select + {{ dbt_utils.date_trunc(period, date_column_name) }} as metric_period, + sum({{ column_name }}) as agg_metric_value + from + {{ model }} + group by + {{ dbt_utils.date_trunc(period, date_column_name) }} + +), + +grouped_metric_values_with_priors as ( + + select + *, + lag(agg_metric_value, {{ lookback_periods }}) over(order by metric_period) as prior_agg_metric_value + from + grouped_metric_values d + +), + +{%- if take_diffs %} + +metric_values as ( + + select + *, + {{ dbt_expectations._get_metric_expression("agg_metric_value", take_logs) }} + - + {{ dbt_expectations._get_metric_expression("prior_agg_metric_value", take_logs) }} + as metric_test_value + from + grouped_metric_values_with_priors d +), + +{%- else %} + +metric_values as ( + select + *, + {{ dbt_expectations._get_metric_expression("agg_metric_value", take_logs) }} + from + grouped_metric_values +), +{%- endif %} + +metric_moving_calcs as ( + + select + *, + avg(metric_test_value) + over(order by metric_period rows + between {{ trend_periods }} preceding and 1 preceding) as metric_test_rolling_average, + stdev(metric_test_value) + over(order by metric_period rows + between {{ trend_periods }} preceding and 1 preceding) as metric_test_rolling_stddev + from + metric_values + +), +metric_sigma as ( + + select + *, + (metric_test_value - metric_test_rolling_average) as metric_test_delta, + (metric_test_value - metric_test_rolling_average)/nullif(metric_test_rolling_stddev, 0) as metric_test_sigma + from + metric_moving_calcs + +) +select + count(*) +from + metric_sigma +where + + metric_period >= cast( + {{ dbt_utils.dateadd(period, -test_periods, dbt_utils.date_trunc(period, dbt_date.now())) }} + as {{ dbt_utils.type_timestamp() }}) + and + metric_period < {{ dbt_utils.date_trunc(period, dbt_date.now()) }} + and + + not ( + metric_test_sigma >= {{ sigma_threshold_lower }} and + metric_test_sigma <= {{ sigma_threshold_upper }} + ) +{%- endmacro -%} + + +{% macro synapse__test_expect_column_values_to_be_within_n_moving_stdevs(model, + column_name, + date_column_name, + period, + lookback_periods, + trend_periods, + test_periods, + sigma_threshold, + sigma_threshold_upper, + sigma_threshold_lower, + take_diffs, + take_logs + ) %} + {% do return( tsql_utils.sqlserver__test_expect_column_values_to_be_within_n_moving_stdevs(model, + column_name, + date_column_name, + period, + lookback_periods, + trend_periods, + test_periods, + sigma_threshold, + sigma_threshold_upper, + sigma_threshold_lower, + take_diffs, + take_logs + )) %} +{%- endmacro -%} diff --git a/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_stdevs.sql b/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_stdevs.sql new file mode 100644 index 0000000..3ac8d14 --- /dev/null +++ b/macros/dbt_expectations/schema_tests/distributional/expect_column_values_to_be_within_n_stdevs.sql @@ -0,0 +1,63 @@ + +{% macro sqlserver__test_expect_column_values_to_be_within_n_stdevs(model, + column_name, + group_by, + sigma_threshold + ) %} +with metric_values as ( + + {% if group_by -%} + select + {{ group_by }} as metric_date, + sum({{ column_name }}) as {{ column_name }} + from + {{ model }} + group by + {{ group_by }} + {%- else -%} + select + {{ column_name }} as {{ column_name }} + from + {{ model }} + {%- endif %} + +), +metric_values_with_statistics as ( + + select + *, + avg({{ column_name }}) over() as {{ column_name }}_average, + stdev({{ column_name }}) over() as {{ column_name }}_stddev + from + metric_values + +), +metric_values_z_scores as ( + + select + *, + ({{ column_name }} - {{ column_name }}_average)/{{ column_name }}_stddev as {{ column_name }}_sigma + from + metric_values_with_statistics + +) +select + count(*) as error_count +from + metric_values_z_scores +where + abs({{ column_name }}_sigma) > {{ sigma_threshold }} +{%- endmacro %} + + +{% macro synapse__test_expect_column_values_to_be_within_n_stdevs(model, + column_name, + group_by, + sigma_threshold + ) %} + {% do return( tsql_utils.sqlserver__test_expect_column_values_to_be_within_n_stdevs(model, + column_name, + group_by, + sigma_threshold + )) %} +{%- endmacro %} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/multi-column/expect_select_column_values_to_be_unique_within_record.sql b/macros/dbt_expectations/schema_tests/multi-column/expect_select_column_values_to_be_unique_within_record.sql new file mode 100644 index 0000000..7443ef8 --- /dev/null +++ b/macros/dbt_expectations/schema_tests/multi-column/expect_select_column_values_to_be_unique_within_record.sql @@ -0,0 +1,82 @@ +{% macro sqlserver__test_expect_select_column_values_to_be_unique_within_record(model, + column_list, + quote_columns, + ignore_row_if, + row_condition + ) %} + +{% if not quote_columns %} + {%- set columns=column_list %} +{% elif quote_columns %} + {%- set columns=[] %} + {% for column in column_list -%} + {% set columns = columns.append( adapter.quote(column) ) %} + {%- endfor %} +{% else %} + {{ exceptions.raise_compiler_error( + "`quote_columns` argument for unique_combination_of_columns test must be one of [True, False] Got: '" ~ quote_columns ~"'.'" + ) }} +{% endif %} + +with column_values as ( + + select + row_number() over(order by (SELECT 'does order really matter here?')) as row_index, + {% for column in columns -%} + {{ column }}{% if not loop.last %},{% endif %} + {%- endfor %} + from {{ model }} + where 1=1 + {% if row_condition %} + and {{ row_condition }} + {% endif %} + {% if ignore_row_if == "all_values_are_missing" %} + and + ( + {% for column in columns -%} + {{ column }} is not null{% if not loop.last %} and {% endif %} + {%- endfor %} + ) + {% elif ignore_row_if == "any_value_is_missing" %} + and + ( + {% for column in columns -%} + {{ column }} is not null{% if not loop.last %} or {% endif %} + {%- endfor %} + ) + {% endif %} + +), +unpivot_columns as ( + + {% for column in columns %} + select row_index, '{{ column }}' as column_name, {{ column }} as column_value from column_values + {% if not loop.last %}union all{% endif %} + {% endfor %} +), +validation_errors as ( + + select + row_index, + count(distinct column_value) as column_values + from unpivot_columns + group by row_index + having count(distinct column_value) < {{ columns | length }} + +) +select count(*) from validation_errors +{% endmacro %} + +{% macro synapse__test_expect_select_column_values_to_be_unique_within_record(model, + column_list, + quote_columns, + ignore_row_if, + row_condition + ) %} + {% do return( tsql_utils.sqlserver__test_expect_select_column_values_to_be_unique_within_record(model, + column_list, + quote_columns, + ignore_row_if, + row_condition + )) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/dbt_expectations/schema_tests/table_shape/expect_grouped_row_values_to_have_recent_data.sql b/macros/dbt_expectations/schema_tests/table_shape/expect_grouped_row_values_to_have_recent_data.sql new file mode 100644 index 0000000..91a751b --- /dev/null +++ b/macros/dbt_expectations/schema_tests/table_shape/expect_grouped_row_values_to_have_recent_data.sql @@ -0,0 +1,31 @@ +{% macro sqlserver__test_expect_grouped_row_values_to_have_recent_data(model, group_by, timestamp_column, datepart, interval) %} +with latest_grouped_timestamps as ( + + select + {%- for g in group_by %} + {{ g }}, + {%- endfor %} + max({{ timestamp_column }}) as latest_timestamp_column + from + {{ model }} + group by + {%- for g in group_by %} + {{g}}{%- if not loop.last %}, {%- endif %} + {%- endfor %} + +), +validation_errors as ( + + select * + from + latest_grouped_timestamps + where + latest_timestamp_column < {{ dbt_utils.dateadd(datepart, interval * -1, dbt_date.now()) }} + +) +select count(*) from validation_errors +{% endmacro %} + +{% macro synapse__test_expect_grouped_row_values_to_have_recent_data(model, group_by, timestamp_column, datepart, interval) %} + {% do return( tsql_utils.sqlserver__test_expect_grouped_row_values_to_have_recent_data(model, group_by, timestamp_column, datepart, interval)) %} +{% endmacro %} \ No newline at end of file