From 992160955851abaf37c748366a18ab58b41b8d78 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Thu, 14 Jul 2022 12:22:11 -0400
Subject: [PATCH 1/5] Add correlation similarity to column pairs and update

---
 sdmetrics/column_pairs/base.py                | 20 ++++-
 .../column_pairs/statistical/__init__.py      |  2 +
 .../statistical/correlation_similarity.py     | 90 +++++++++++++++++++
 .../statistical/statistic_similarity.py       | 22 ++---
 tests/unit/column_pairs/__init__.py           |  1 +
 .../unit/column_pairs/statistical/__init__.py |  1 +
 .../test_correlation_similarity.py            | 90 +++++++++++++++++++
 tests/unit/single_column/__init__.py          |  1 +
 .../single_column/statistical/__init__.py     |  1 +
 .../statistical/test_statistic_similarity.py  | 10 +--
 tests/utils.py                                | 28 ++++++
 11 files changed, 247 insertions(+), 19 deletions(-)
 create mode 100644 sdmetrics/column_pairs/statistical/correlation_similarity.py
 create mode 100644 tests/unit/column_pairs/__init__.py
 create mode 100644 tests/unit/column_pairs/statistical/__init__.py
 create mode 100644 tests/unit/column_pairs/statistical/test_correlation_similarity.py
 create mode 100644 tests/unit/single_column/__init__.py
 create mode 100644 tests/unit/single_column/statistical/__init__.py
 create mode 100644 tests/utils.py

diff --git a/sdmetrics/column_pairs/base.py b/sdmetrics/column_pairs/base.py
index e101f797..c1712524 100644
--- a/sdmetrics/column_pairs/base.py
+++ b/sdmetrics/column_pairs/base.py
@@ -35,7 +35,25 @@ def compute(real_data, synthetic_data):
                 pandas.DataFrame with 2 columns.
 
         Returns:
-            Union[float, tuple[float]]:
+            float:
                 Metric output.
         """
         raise NotImplementedError()
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data):
+        """Compute the breakdown of this metric.
+
+        Args:
+            real_data (pandas.DataFrame):
+                The values from the real dataset, passed as pandas.DataFrame
+                with 2 columns.
+            synthetic_data (pandas.DataFrame):
+                The values from the synthetic dataset, passed as a
+                pandas.DataFrame with 2 columns.
+
+        Returns:
+            dict:
+                A mapping of the metric output. Must contain the key 'score'.
+        """
+        return {'score': cls.compute(real_data, synthetic_data)}
diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py
index e79ac684..c9175c95 100644
--- a/sdmetrics/column_pairs/statistical/__init__.py
+++ b/sdmetrics/column_pairs/statistical/__init__.py
@@ -1,11 +1,13 @@
 """Statistical Metrics to compare column pairs."""
 
 from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
+from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
 
 __all__ = [
     'ContingencySimilarity',
     'ContinuousKLDivergence',
+    'CorrelationSimilarity',
     'DiscreteKLDivergence',
 ]
diff --git a/sdmetrics/column_pairs/statistical/correlation_similarity.py b/sdmetrics/column_pairs/statistical/correlation_similarity.py
new file mode 100644
index 00000000..144ac66b
--- /dev/null
+++ b/sdmetrics/column_pairs/statistical/correlation_similarity.py
@@ -0,0 +1,90 @@
+"""Correlation Similarity Metric."""
+
+import pandas as pd
+from scipy.stats import pearsonr, spearmanr
+
+from sdmetrics.column_pairs.base import ColumnPairsMetric
+from sdmetrics.goal import Goal
+from sdmetrics.utils import is_datetime
+
+
+class CorrelationSimilarity(ColumnPairsMetric):
+    """Correlation similarity metric.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'CorrelationSimilarity'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
+        """Compare the correlation similarity of two continuous columns.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.Series]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.Series]):
+                The values from the synthetic dataset.
+
+        Returns:
+            float:
+                The correlation similarity of the two columns.
+        """
+        real_data = pd.Series(real_data).dropna()
+        synthetic_data = pd.Series(synthetic_data).dropna()
+
+        if is_datetime(real_data):
+            real_data = pd.to_numeric(real_data)
+            synthetic_data = pd.to_numeric(synthetic_data)
+
+        correlation_fn = None
+        if coefficient == 'Pearson':
+            correlation_fn = pearsonr
+        elif coefficient == 'Spearman':
+            correlation_fn = spearmanr
+        else:
+            raise ValueError(f'requested coefficient {coefficient} is not valid. '
+                             'Please choose either Pearson or Spearman.')
+
+        return correlation_fn(real_data, synthetic_data)
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
+        """Compare the breakdown of correlation similarity of two continuous columns.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.Series]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.Series]):
+                The values from the synthetic dataset.
+
+        Returns:
+            dict:
+                A dict containing the score.
+        """
+        return {'score': cls.compute(real_data, synthetic_data, coefficient)}
+
+    @classmethod
+    def normalize(cls, raw_score):
+        """Return the `raw_score` as is, since it is already normalized.
+
+        Args:
+            raw_score (float):
+                The value of the metric from `compute`.
+
+        Returns:
+            float:
+                The normalized value of the metric.
+        """
+        return super().normalize(raw_score)
diff --git a/sdmetrics/single_column/statistical/statistic_similarity.py b/sdmetrics/single_column/statistical/statistic_similarity.py
index 678325fc..53450236 100644
--- a/sdmetrics/single_column/statistical/statistic_similarity.py
+++ b/sdmetrics/single_column/statistical/statistic_similarity.py
@@ -19,20 +19,15 @@ class StatisticSimilarity(SingleColumnMetric):
             Minimum value or values that this metric can take.
         max_value (Union[float, tuple[float]]):
             Maximum value or values that this metric can take.
-        statistic (str):
-            The statistic to compute the metric on (mean, std, or median). Defaults to mean.
     """
 
     name = 'StatisticSimilarity'
     goal = Goal.MAXIMIZE
     min_value = 0.0
    max_value = 1.0
-    statistic = 'mean'
 
-    def __init__(self, statistic='mean'):
-        self.statistic = statistic
-
-    def compute(self, real_data, synthetic_data):
+    @classmethod
+    def compute(cls, real_data, synthetic_data, statistic='mean'):
         """Compare the statistic similarity of two continuous columns.
 
         Args:
@@ -45,9 +40,10 @@ def compute(self, real_data, synthetic_data):
             float:
                 The statistical similarity of the two columns.
""" - return self.compute_breakdown(real_data, synthetic_data)['score'] + return cls.compute_breakdown(real_data, synthetic_data, statistic)['score'] - def compute_breakdown(self, real_data, synthetic_data): + @staticmethod + def compute_breakdown(real_data, synthetic_data, statistic='mean'): """Compare the breakdown of statistic similarity of two continuous columns. Args: @@ -67,17 +63,17 @@ def compute_breakdown(self, real_data, synthetic_data): real_data = pd.to_numeric(real_data) synthetic_data = pd.to_numeric(synthetic_data) - if self.statistic == 'mean': + if statistic == 'mean': score_real = real_data.mean() score_synthetic = synthetic_data.mean() - elif self.statistic == 'std': + elif statistic == 'std': score_real = real_data.std() score_synthetic = synthetic_data.std() - elif self.statistic == 'median': + elif statistic == 'median': score_real = real_data.median() score_synthetic = synthetic_data.median() else: - raise ValueError(f'requested statistic {self.statistic} is not valid. ' + raise ValueError(f'requested statistic {statistic} is not valid. ' 'Please choose either mean, std, or median.') score = 1 - abs(score_real - score_synthetic) / (real_data.max() - real_data.min()) diff --git a/tests/unit/column_pairs/__init__.py b/tests/unit/column_pairs/__init__.py new file mode 100644 index 00000000..aac7ef87 --- /dev/null +++ b/tests/unit/column_pairs/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the column pairs module.""" diff --git a/tests/unit/column_pairs/statistical/__init__.py b/tests/unit/column_pairs/statistical/__init__.py new file mode 100644 index 00000000..13be4859 --- /dev/null +++ b/tests/unit/column_pairs/statistical/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the column pairs statistical metrics.""" diff --git a/tests/unit/column_pairs/statistical/test_correlation_similarity.py b/tests/unit/column_pairs/statistical/test_correlation_similarity.py new file mode 100644 index 00000000..525e9a45 --- /dev/null +++ b/tests/unit/column_pairs/statistical/test_correlation_similarity.py @@ -0,0 +1,90 @@ +from unittest.mock import Mock, patch + +import pandas as pd + +from sdmetrics.column_pairs.statistical import CorrelationSimilarity +from tests.utils import SeriesMatcher + + +class TestCorrelationSimilarity: + + @patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr') + def test_compute(self, pearson_mock): + """Test the ``compute`` method. + + Expect that the selected coefficient is used to compare the real and synthetic data. + + Setup: + - Patch the ``scipy.stats.pearsonr`` method to return a test result. + + Input: + - Real data. + - Synthetic data. + + Output: + - The evaluated metric. + """ + # Setup + real_data = pd.Series([1.0, 2.4, 2.6, 0.8]) + synthetic_data = pd.Series([0.9, 1.8, 3.1, 5.0]) + + metric = CorrelationSimilarity() + + # Run + result = metric.compute(real_data, synthetic_data, coefficient='Pearson') + + # Assert + pearson_mock.assert_called_once_with( + SeriesMatcher(real_data), SeriesMatcher(synthetic_data)) + assert result == pearson_mock.return_value + + def test_compute_breakdown(self): + """Test the ``compute_breakdown`` method. + + Expect that the selected coefficient is used to compare the real and synthetic data. + + Setup: + - Mock the ``compute`` method to return a test score. + + Input: + - Mocked real data. + - Mocked synthetic data. + + Output: + - A mapping of the metric results, containing the score and the real and synthetic results. 
+ """ + # Setup + test_score = 0.2 + metric = CorrelationSimilarity() + + # Run + with patch.object(CorrelationSimilarity, 'compute', return_value=test_score): + result = metric.compute_breakdown(Mock(), Mock(), coefficient='Pearson') + + # Assert + assert result == {'score': test_score} + + @patch( + 'sdmetrics.column_pairs.statistical.correlation_similarity.ColumnPairsMetric.normalize' + ) + def test_normalize(self, normalize_mock): + """Test the ``normalize`` method. + + Expect that the inherited ``normalize`` method is called. + + Input: + - Raw score + + Output: + - The output of the inherited ``normalize`` method. + """ + # Setup + metric = CorrelationSimilarity() + raw_score = 0.9 + + # Run + result = metric.normalize(raw_score) + + # Assert + normalize_mock.assert_called_once_with(raw_score) + assert result == normalize_mock.return_value diff --git a/tests/unit/single_column/__init__.py b/tests/unit/single_column/__init__.py new file mode 100644 index 00000000..051ac498 --- /dev/null +++ b/tests/unit/single_column/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the single column module.""" diff --git a/tests/unit/single_column/statistical/__init__.py b/tests/unit/single_column/statistical/__init__.py new file mode 100644 index 00000000..9ba140e4 --- /dev/null +++ b/tests/unit/single_column/statistical/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the statistical single column metrics.""" diff --git a/tests/unit/single_column/statistical/test_statistic_similarity.py b/tests/unit/single_column/statistical/test_statistic_similarity.py index 9c493291..a85d5368 100644 --- a/tests/unit/single_column/statistical/test_statistic_similarity.py +++ b/tests/unit/single_column/statistical/test_statistic_similarity.py @@ -26,10 +26,10 @@ def test_compute_breakdown(self): real_data = pd.Series([1.0, 2.4, 2.6, 0.8]) synthetic_data = pd.Series([0.9, 1.8, 3.1, 5.0]) - metric = StatisticSimilarity(statistic='mean') + metric = StatisticSimilarity() # Run - result = metric.compute_breakdown(real_data, synthetic_data) + result = metric.compute_breakdown(real_data, synthetic_data, statistic='mean') # Assert assert result == {'score': 1 - (2.7 - 1.7) / 1.8, 'real': 1.7, 'synthetic': 2.7} @@ -52,11 +52,11 @@ def test_compute(self): # Setup metric_breakdown = {'score': 0.56, 'real': 1.7, 'synthetic': 2.7} - metric = StatisticSimilarity(statistic='mean') + metric = StatisticSimilarity() # Run with patch.object(StatisticSimilarity, 'compute_breakdown', return_value=metric_breakdown): - result = metric.compute(Mock(), Mock()) + result = metric.compute(Mock(), Mock(), statistic='mean') # Assert assert result == 0.56 @@ -74,7 +74,7 @@ def test_normalize(self, normalize_mock): - The output of the inherited ``normalize`` method. 
""" # Setup - metric = StatisticSimilarity(statistic='mean') + metric = StatisticSimilarity() raw_score = 0.9 # Run diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..37f6a71c --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,28 @@ +"""Utils for testing.""" +import pandas as pd + + +class DataFrameMatcher: + """Match a given Pandas DataFrame in a mock function call.""" + + def __init__(self, df): + """Initialize the DataFrame.""" + self.df = df + + def __eq__(self, other): + """Assert equality using pandas testing module.""" + pd.testing.assert_frame_equal(self.df, other) + return True + + +class SeriesMatcher: + """Match a given Pandas Series in a mock function call.""" + + def __init__(self, data): + """Initialize the Series.""" + self.data = data + + def __eq__(self, other): + """Assert equality using pandas testing module.""" + pd.testing.assert_series_equal(self.data, other) + return True From 41b04b57d40f9e83c2b61e64c356756f7de1ee87 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:25:40 -0400 Subject: [PATCH 2/5] Add unit test for column pairs --- tests/unit/column_pairs/test_base.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/unit/column_pairs/test_base.py diff --git a/tests/unit/column_pairs/test_base.py b/tests/unit/column_pairs/test_base.py new file mode 100644 index 00000000..d0d248aa --- /dev/null +++ b/tests/unit/column_pairs/test_base.py @@ -0,0 +1,32 @@ +from unittest.mock import Mock, patch + +from sdmetrics.column_pairs.base import ColumnPairsMetric + + +class TestColumnPairsMetric: + + def test_compute_breakdown(self): + """Test the ``compute_breakdown`` method. + + Expect a breakdown dictionary is returned that contains the score. + + Setup: + - Mock the ``compute`` method to return a fake score. + + Input: + - Real data. + - Synthetic data. + + Output: + - The evaluated metric. + """ + # Setup + metric = ColumnPairsMetric() + test_metric_score = 0.5 + + # Run + with patch.object(ColumnPairsMetric, 'compute', return_value=test_metric_score): + result = metric.compute_breakdown(Mock(), Mock()) + + # Assert + assert result == {'score': test_metric_score} From 4720fab2736009d3aee85f451586f81fc6467c87 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:29:39 -0400 Subject: [PATCH 3/5] Add single table metric --- sdmetrics/single_table/multi_column_pairs.py | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sdmetrics/single_table/multi_column_pairs.py b/sdmetrics/single_table/multi_column_pairs.py index f96a90fc..7443bf13 100644 --- a/sdmetrics/single_table/multi_column_pairs.py +++ b/sdmetrics/single_table/multi_column_pairs.py @@ -186,3 +186,28 @@ class ContingencySimilarity(MultiColumnPairsMetric): field_types = ('boolean', 'categorical') column_pairs_metric = column_pairs.statistical.contingency_similarity.ContingencySimilarity + + +class CorrelationSimilarity(MultiColumnPairsMetric): + """MultiColumnPairsMetric based on ColumnPairs CorrelationSimilarity. + + This computes the correlation between column pairs based on the specified coefficient, + which defaults to 'Pearson'. + + Attributes: + name (str): + Name to use when reports about this metric are printed. + goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. 
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
+            ColumnPairs CorrelationSimilarity.
+        field_types (tuple):
+            Field types to which the ColumnPairs metric will be applied.
+    """
+
+    field_types = ('numerical', 'datetime')
+    column_pairs_metric = column_pairs.statistical.correlation_similarity.CorrelationSimilarity

From e0516c3459b9bffed7cba2d3158b6fa412978511 Mon Sep 17 00:00:00 2001
From: Katharine Xiao <2405771+katxiao@users.noreply.github.com>
Date: Thu, 14 Jul 2022 15:30:54 -0400
Subject: [PATCH 4/5] Update metric

---
 .../statistical/correlation_similarity.py     | 31 ++++++----
 .../test_correlation_similarity.py            | 58 ++++++++++++-------
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/sdmetrics/column_pairs/statistical/correlation_similarity.py b/sdmetrics/column_pairs/statistical/correlation_similarity.py
index 144ac66b..d6ef20a2 100644
--- a/sdmetrics/column_pairs/statistical/correlation_similarity.py
+++ b/sdmetrics/column_pairs/statistical/correlation_similarity.py
@@ -28,8 +28,8 @@ class CorrelationSimilarity(ColumnPairsMetric):
     max_value = 1.0
 
     @classmethod
-    def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
-        """Compare the correlation similarity of two continuous columns.
+    def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
+        """Compare the breakdown of correlation similarity of two continuous columns.
 
         Args:
             real_data (Union[numpy.ndarray, pandas.Series]):
@@ -38,11 +38,12 @@ def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
             The values from the synthetic dataset.
 
         Returns:
-            float:
-                The correlation similarity of the two columns.
+            dict:
+                A dict containing the score, and the real and synthetic metric values.
         """
-        real_data = pd.Series(real_data).dropna()
-        synthetic_data = pd.Series(synthetic_data).dropna()
+        real_data[pd.isna(real_data)] = 0.0
+        synthetic_data[pd.isna(synthetic_data)] = 0.0
+        column1, column2 = real_data.columns[:2]
 
         if is_datetime(real_data):
             real_data = pd.to_numeric(real_data)
@@ -57,11 +58,17 @@ def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
             raise ValueError(f'requested coefficient {coefficient} is not valid. '
                              'Please choose either Pearson or Spearman.')
 
-        return correlation_fn(real_data, synthetic_data)
+        correlation_real, _ = correlation_fn(real_data[column1], real_data[column2])
+        correlation_synthetic, _ = correlation_fn(synthetic_data[column1], synthetic_data[column2])
+        return {
+            'score': 1 - abs(correlation_real - correlation_synthetic) / 2,
+            'real': correlation_real,
+            'synthetic': correlation_synthetic,
+        }
 
     @classmethod
-    def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
-        """Compare the breakdown of correlation similarity of two continuous columns.
+    def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
+        """Compare the correlation similarity of two continuous columns.
 
         Args:
             real_data (Union[numpy.ndarray, pandas.Series]):
                 The values from the real dataset.
             synthetic_data (Union[numpy.ndarray, pandas.Series]):
                 The values from the synthetic dataset.
 
         Returns:
-            dict:
-                A dict containing the score.
+            float:
+                The correlation similarity of the two columns.
""" - return {'score': cls.compute(real_data, synthetic_data, coefficient)} + return cls.compute_breakdown(real_data, synthetic_data, coefficient)['score'] @classmethod def normalize(cls, raw_score): diff --git a/tests/unit/column_pairs/statistical/test_correlation_similarity.py b/tests/unit/column_pairs/statistical/test_correlation_similarity.py index 525e9a45..71a63d62 100644 --- a/tests/unit/column_pairs/statistical/test_correlation_similarity.py +++ b/tests/unit/column_pairs/statistical/test_correlation_similarity.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch import pandas as pd @@ -9,8 +9,8 @@ class TestCorrelationSimilarity: @patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr') - def test_compute(self, pearson_mock): - """Test the ``compute`` method. + def test_compute_breakdown(self, pearson_mock): + """Test the ``compute_breakdown`` method. Expect that the selected coefficient is used to compare the real and synthetic data. @@ -18,28 +18,37 @@ def test_compute(self, pearson_mock): - Patch the ``scipy.stats.pearsonr`` method to return a test result. Input: - - Real data. - - Synthetic data. + - Mocked real data. + - Mocked synthetic data. Output: - - The evaluated metric. + - A mapping of the metric results, containing the score and the real and synthetic results. """ # Setup - real_data = pd.Series([1.0, 2.4, 2.6, 0.8]) - synthetic_data = pd.Series([0.9, 1.8, 3.1, 5.0]) - - metric = CorrelationSimilarity() + real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]}) + synthetic_data = pd.DataFrame({'col1': [0.9, 1.8, 3.1, 5.0], 'col2': [2, 3, 4, 1]}) + score_real = -0.451 + score_synthetic = -0.003 + pearson_mock.side_effect = [score_real, score_synthetic] + expected_score_breakdown = { + 'score': 1 - abs(score_real - score_synthetic) / 2, + 'real': score_real, + 'synthetic': score_synthetic, + } # Run - result = metric.compute(real_data, synthetic_data, coefficient='Pearson') + metric = CorrelationSimilarity() + result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson') # Assert - pearson_mock.assert_called_once_with( - SeriesMatcher(real_data), SeriesMatcher(synthetic_data)) - assert result == pearson_mock.return_value + assert pearson_mock.has_calls( + call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])), + call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])), + ) + assert result == expected_score_breakdown - def test_compute_breakdown(self): - """Test the ``compute_breakdown`` method. + def test_compute(self): + """Test the ``compute`` method. Expect that the selected coefficient is used to compare the real and synthetic data. @@ -47,22 +56,27 @@ def test_compute_breakdown(self): - Mock the ``compute`` method to return a test score. Input: - - Mocked real data. - - Mocked synthetic data. + - Real data. + - Synthetic data. Output: - - A mapping of the metric results, containing the score and the real and synthetic results. + - The evaluated metric. 
""" # Setup test_score = 0.2 + score_breakdown = {'score': test_score} metric = CorrelationSimilarity() # Run - with patch.object(CorrelationSimilarity, 'compute', return_value=test_score): - result = metric.compute_breakdown(Mock(), Mock(), coefficient='Pearson') + with patch.object( + CorrelationSimilarity, + 'compute_breakdown', + return_value=score_breakdown, + ): + result = metric.compute(Mock(), Mock(), coefficient='Pearson') # Assert - assert result == {'score': test_score} + assert result == test_score @patch( 'sdmetrics.column_pairs.statistical.correlation_similarity.ColumnPairsMetric.normalize' From 5846cc64dcf7eeb7ba7bb553ef471bf01c7cd678 Mon Sep 17 00:00:00 2001 From: Katharine Xiao <2405771+katxiao@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:48:18 -0400 Subject: [PATCH 5/5] Add unit test --- .../test_correlation_similarity.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/unit/column_pairs/statistical/test_correlation_similarity.py b/tests/unit/column_pairs/statistical/test_correlation_similarity.py index 71a63d62..715767e2 100644 --- a/tests/unit/column_pairs/statistical/test_correlation_similarity.py +++ b/tests/unit/column_pairs/statistical/test_correlation_similarity.py @@ -1,3 +1,4 @@ +from datetime import datetime from unittest.mock import Mock, call, patch import pandas as pd @@ -47,6 +48,51 @@ def test_compute_breakdown(self, pearson_mock): ) assert result == expected_score_breakdown + @patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr') + def test_compute_breakdown_datetime(self, pearson_mock): + """Test the ``compute_breakdown`` method with datetime input. + + Expect that the selected coefficient is used to compare the real and synthetic data. + + Setup: + - Patch the ``scipy.stats.pearsonr`` method to return a test result. + + Input: + - Mocked real data. + - Mocked synthetic data. + + Output: + - A mapping of the metric results, containing the score and the real and synthetic results. + """ + # Setup + real_data = pd.DataFrame({ + 'col1': [datetime(2020, 1, 3), datetime(2020, 10, 13), datetime(2021, 5, 3)], + 'col2': [datetime(2021, 7, 23), datetime(2021, 8, 3), datetime(2020, 9, 24)], + }) + synthetic_data = pd.DataFrame({ + 'col1': [datetime(2021, 9, 19), datetime(2021, 10, 1), datetime(2020, 3, 1)], + 'col2': [datetime(2022, 4, 28), datetime(2021, 7, 31), datetime(2020, 4, 2)], + }) + score_real = 0.2 + score_synthetic = 0.1 + pearson_mock.side_effect = [score_real, score_synthetic] + expected_score_breakdown = { + 'score': 1 - abs(score_real - score_synthetic) / 2, + 'real': score_real, + 'synthetic': score_synthetic, + } + + # Run + metric = CorrelationSimilarity() + result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson') + + # Assert + assert pearson_mock.has_calls( + call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])), + call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])), + ) + assert result == expected_score_breakdown + def test_compute(self): """Test the ``compute`` method.