|
| 1 | +import pandas |
| 2 | + |
| 3 | +from great_expectations.metrics.column_aggregate.mean import ( |
| 4 | + ColumnValuesMean, |
| 5 | + ColumnValuesMeanResult, |
| 6 | +) |
| 7 | +from great_expectations.metrics.metric_results import MetricErrorResult |
| 8 | +from tests.integration.conftest import parameterize_batch_for_data_sources |
| 9 | +from tests.integration.test_utils.data_source_config import ( |
| 10 | + BigQueryDatasourceTestConfig, |
| 11 | + DatabricksDatasourceTestConfig, |
| 12 | + DataSourceTestConfig, |
| 13 | + MSSQLDatasourceTestConfig, |
| 14 | + PandasDataFrameDatasourceTestConfig, |
| 15 | + PostgreSQLDatasourceTestConfig, |
| 16 | + SnowflakeDatasourceTestConfig, |
| 17 | + SparkFilesystemCsvDatasourceTestConfig, |
| 18 | + SqliteDatasourceTestConfig, |
| 19 | +) |
| 20 | + |
# Shared 4-row fixture: numeric columns exercise the mean success path,
# the string column exercises the failure path.
DATA_FRAME = pandas.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "number": [1, 2, 3, 4],
        "string": list("abcd"),
    }
)
| 28 | + |
# Data sources on which the mean-failure test behaves as expected; spark,
# databricks, and sqlite are excluded (see the comment on test_mean_failure).
DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE: list[DataSourceTestConfig] = [
    BigQueryDatasourceTestConfig(),
    MSSQLDatasourceTestConfig(),
    PostgreSQLDatasourceTestConfig(),
    SnowflakeDatasourceTestConfig(),
    PandasDataFrameDatasourceTestConfig(),
]

# The full matrix: the restricted list plus the excluded data sources.
DATA_SOURCES: list[DataSourceTestConfig] = [
    *DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE,
    SparkFilesystemCsvDatasourceTestConfig(),
    DatabricksDatasourceTestConfig(),
    SqliteDatasourceTestConfig(),
]
| 42 | + |
| 43 | + |
@parameterize_batch_for_data_sources(
    data_source_configs=DATA_SOURCES,
    data=DATA_FRAME,
)
def test_mean_success(batch_for_datasource) -> None:
    """The mean of a numeric column is computed correctly on every data source."""
    result = batch_for_datasource.compute_metrics(
        ColumnValuesMean(batch_id=batch_for_datasource.id, column="number")
    )
    assert isinstance(result, ColumnValuesMeanResult)
    assert result.value == 2.5
| 54 | + |
| 55 | + |
# For spark, when computing the mean, if it fails, the metric name changes from
# `column.mean` to `column.aggregate.mean`.
# There is a bug to track fixing this: https://greatexpectations.atlassian.net/browse/GX-448
# For databricks, when computing the mean, any non-numeric values are ignored and the result is
# None, which will cause a crash later when trying to set the value of the MetricResult
# (not MetricErrorResult) to None.
# For sqlite, when computing the mean, any non-numeric values are ignored (or maybe treated
# as 0), so we don't get an error.
@parameterize_batch_for_data_sources(
    data_source_configs=DATA_SOURCES_WITHOUT_SPARK_DATABRICKS_SQLITE,
    data=DATA_FRAME,
)
def test_mean_failure(batch_for_datasource) -> None:
    """Computing the mean of a non-numeric column yields a MetricErrorResult."""
    batch = batch_for_datasource
    metric = ColumnValuesMean(batch_id=batch.id, column="string")
    metric_result = batch.compute_metrics(metric)
    assert isinstance(metric_result, MetricErrorResult)