Skip to content

Commit 5b1cde8

Browse files
[FEATURE] Add mean to metrics API (#10961)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f3189ec commit 5b1cde8

File tree

4 files changed

+83
-0
lines changed

4 files changed

+83
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .batch.row_count import BatchRowCount
2+
from .column_aggregate.mean import ColumnValuesMean
23
from .column_values.non_null import ColumnValuesNonNull, ColumnValuesNonNullCount
34
from .metric import Metric

great_expectations/metrics/column_aggregate/__init__.py

Whitespace-only changes.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from great_expectations.metrics.domain import ColumnValues
2+
from great_expectations.metrics.metric import Metric
3+
from great_expectations.metrics.metric_results import MetricResult
4+
5+
6+
# Typed result for the "column.mean" metric; parameterized on float, so
# `value` carries the computed mean as a float (tests assert e.g. value == 2.5).
class ColumnValuesMeanResult(MetricResult[float]): ...
7+
8+
9+
class ColumnValuesMean(Metric[ColumnValuesMeanResult], ColumnValues):
    """Column-aggregate metric that computes the mean of a column's values.

    Combines the ``Metric`` machinery (typed to return a
    ``ColumnValuesMeanResult``) with the ``ColumnValues`` domain, which
    supplies the ``batch_id``/``column`` fields used when instantiating it.
    """

    # Metric identifier string. NOTE(review): on spark a failed computation
    # reports this as "column.aggregate.mean" instead — see GX-448 in the
    # accompanying tests.
    name = "column.mean"
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import pandas
2+
3+
from great_expectations.metrics.column_aggregate.mean import (
4+
ColumnValuesMean,
5+
ColumnValuesMeanResult,
6+
)
7+
from great_expectations.metrics.metric_results import MetricErrorResult
8+
from tests.integration.conftest import parameterize_batch_for_data_sources
9+
from tests.integration.test_utils.data_source_config import (
10+
BigQueryDatasourceTestConfig,
11+
DatabricksDatasourceTestConfig,
12+
DataSourceTestConfig,
13+
MSSQLDatasourceTestConfig,
14+
PandasDataFrameDatasourceTestConfig,
15+
PostgreSQLDatasourceTestConfig,
16+
SnowflakeDatasourceTestConfig,
17+
SparkFilesystemCsvDatasourceTestConfig,
18+
SqliteDatasourceTestConfig,
19+
)
20+
21+
# Shared fixture data: four rows with a numeric "number" column (mean == 2.5)
# and a non-numeric "string" column used to make the mean computation fail.
DATA_FRAME = pandas.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "number": [1, 2, 3, 4],
        "string": ["a", "b", "c", "d"],
    },
)

# Data sources whose failure-path behavior diverges (spark renames the metric,
# databricks/sqlite silently ignore non-numeric values — see the comment above
# test_mean_failure) are excluded here and added back in DATA_SOURCES below.
DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE: list[DataSourceTestConfig] = [
    BigQueryDatasourceTestConfig(),
    MSSQLDatasourceTestConfig(),
    PostgreSQLDatasourceTestConfig(),
    SnowflakeDatasourceTestConfig(),
    PandasDataFrameDatasourceTestConfig(),
]

# Full data-source matrix for the success-path test.
DATA_SOURCES: list[DataSourceTestConfig] = DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE + [
    SparkFilesystemCsvDatasourceTestConfig(),
    DatabricksDatasourceTestConfig(),
    SqliteDatasourceTestConfig(),
]
42+
43+
44+
@parameterize_batch_for_data_sources(
    data_source_configs=DATA_SOURCES,
    data=DATA_FRAME,
)
def test_mean_success(batch_for_datasource) -> None:
    """The mean of the numeric column [1, 2, 3, 4] is 2.5 on every data source."""
    metric = ColumnValuesMean(batch_id=batch_for_datasource.id, column="number")
    result = batch_for_datasource.compute_metrics(metric)
    assert isinstance(result, ColumnValuesMeanResult)
    assert result.value == 2.5
54+
55+
56+
# For spark, when computing the mean, if it fails, the metric name changes from
57+
# `column.mean` to `column.aggregate.mean`.
58+
# There is a bug to track fixing this: https://greatexpectations.atlassian.net/browse/GX-448
59+
# For databricks, when computing the mean, any non-numeric values are ignored and the result is
60+
# None, which will cause a crash later when trying to set the value of the MetricResult
61+
# (not MetricErrorResult) to None.
62+
# For sqlite, when computing the mean, any non-numeric values are ignored (or maybe treated
63+
# as 0) so we don't an error.
64+
@parameterize_batch_for_data_sources(
65+
data_source_configs=DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE,
66+
data=DATA_FRAME,
67+
)
68+
def test_mean_failure(batch_for_datasource) -> None:
69+
batch = batch_for_datasource
70+
metric = ColumnValuesMean(batch_id=batch.id, column="string")
71+
metric_result = batch.compute_metrics(metric)
72+
assert isinstance(metric_result, MetricErrorResult)

0 commit comments

Comments
 (0)