diff --git a/sdmetrics/column_pairs/base.py b/sdmetrics/column_pairs/base.py
index e101f797..c1712524 100644
--- a/sdmetrics/column_pairs/base.py
+++ b/sdmetrics/column_pairs/base.py
@@ -35,7 +35,25 @@ def compute(real_data, synthetic_data):
                 pandas.DataFrame with 2 columns.
 
         Returns:
-            Union[float, tuple[float]]:
+            float:
                 Metric output.
         """
         raise NotImplementedError()
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data):
+        """Compute the breakdown of this metric.
+
+        Args:
+            real_data (pandas.DataFrame):
+                The values from the real dataset, passed as a pandas.DataFrame
+                with 2 columns.
+            synthetic_data (pandas.DataFrame):
+                The values from the synthetic dataset, passed as a
+                pandas.DataFrame with 2 columns.
+
+        Returns:
+            dict:
+                A mapping of the metric output. Must contain the key 'score'.
+        """
+        return {'score': cls.compute(real_data, synthetic_data)}
diff --git a/sdmetrics/column_pairs/statistical/__init__.py b/sdmetrics/column_pairs/statistical/__init__.py
index e79ac684..c9175c95 100644
--- a/sdmetrics/column_pairs/statistical/__init__.py
+++ b/sdmetrics/column_pairs/statistical/__init__.py
@@ -1,11 +1,13 @@
 """Statistical Metrics to compare column pairs."""
 
 from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
+from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
 from sdmetrics.column_pairs.statistical.kl_divergence import (
     ContinuousKLDivergence, DiscreteKLDivergence)
 
 __all__ = [
     'ContingencySimilarity',
     'ContinuousKLDivergence',
+    'CorrelationSimilarity',
     'DiscreteKLDivergence',
 ]
diff --git a/sdmetrics/column_pairs/statistical/correlation_similarity.py b/sdmetrics/column_pairs/statistical/correlation_similarity.py
new file mode 100644
index 00000000..d6ef20a2
--- /dev/null
+++ b/sdmetrics/column_pairs/statistical/correlation_similarity.py
@@ -0,0 +1,97 @@
+"""Correlation Similarity Metric."""
+
+import pandas as pd
+from scipy.stats import pearsonr, spearmanr
+
+from sdmetrics.column_pairs.base import ColumnPairsMetric
+from sdmetrics.goal import Goal
+from sdmetrics.utils import is_datetime
+
+
+class CorrelationSimilarity(ColumnPairsMetric):
+    """Correlation similarity metric.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'CorrelationSimilarity'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @classmethod
+    def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
+        """Compare the breakdown of correlation similarity of two continuous columns.
+
+        Args:
+            real_data (pandas.DataFrame):
+                The values from the real dataset.
+            synthetic_data (pandas.DataFrame):
+                The values from the synthetic dataset.
+
+        Returns:
+            dict:
+                A dict containing the score, and the real and synthetic metric values.
+ """ + real_data[pd.isna(real_data)] = 0.0 + synthetic_data[pd.isna(synthetic_data)] = 0.0 + column1, column2 = real_data.columns[:2] + + if is_datetime(real_data): + real_data = pd.to_numeric(real_data) + synthetic_data = pd.to_numeric(synthetic_data) + + correlation_fn = None + if coefficient == 'Pearson': + correlation_fn = pearsonr + elif coefficient == 'Spearman': + correlation_fn = spearmanr + else: + raise ValueError(f'requested coefficient {coefficient} is not valid. ' + 'Please choose either Pearson or Spearman.') + + correlation_real = correlation_fn(real_data[column1], real_data[column2]) + correlation_synthetic = correlation_fn(synthetic_data[column1], synthetic_data[column2]) + return { + 'score': 1 - abs(correlation_real - correlation_synthetic) / 2, + 'real': correlation_real, + 'synthetic': correlation_synthetic, + } + + @classmethod + def compute(cls, real_data, synthetic_data, coefficient='Pearson'): + """Compare the correlation similarity of two continuous columns. + + Args: + real_data (Union[numpy.ndarray, pandas.Series]): + The values from the real dataset. + synthetic_data (Union[numpy.ndarray, pandas.Series]): + The values from the synthetic dataset. + + Returns: + float: + The correlation similarity of the two columns. + """ + return cls.compute_breakdown(real_data, synthetic_data, coefficient)['score'] + + @classmethod + def normalize(cls, raw_score): + """Return the `raw_score` as is, since it is already normalized. + + Args: + raw_score (float): + The value of the metric from `compute`. + + Returns: + float: + The normalized value of the metric + """ + return super().normalize(raw_score) diff --git a/sdmetrics/single_column/statistical/statistic_similarity.py b/sdmetrics/single_column/statistical/statistic_similarity.py index 678325fc..53450236 100644 --- a/sdmetrics/single_column/statistical/statistic_similarity.py +++ b/sdmetrics/single_column/statistical/statistic_similarity.py @@ -19,20 +19,15 @@ class StatisticSimilarity(SingleColumnMetric): Minimum value or values that this metric can take. max_value (Union[float, tuple[float]]): Maximum value or values that this metric can take. - statistic (str): - The statistic to compute the metric on (mean, std, or median). Defaults to mean. """ name = 'StatisticSimilarity' goal = Goal.MAXIMIZE min_value = 0.0 max_value = 1.0 - statistic = 'mean' - def __init__(self, statistic='mean'): - self.statistic = statistic - - def compute(self, real_data, synthetic_data): + @classmethod + def compute(cls, real_data, synthetic_data, statistic='mean'): """Compare the statistic similarity of two continuous columns. Args: @@ -45,9 +40,10 @@ def compute(self, real_data, synthetic_data): float: The statistical similarity of the two columns. """ - return self.compute_breakdown(real_data, synthetic_data)['score'] + return cls.compute_breakdown(real_data, synthetic_data, statistic)['score'] - def compute_breakdown(self, real_data, synthetic_data): + @staticmethod + def compute_breakdown(real_data, synthetic_data, statistic='mean'): """Compare the breakdown of statistic similarity of two continuous columns. 
 
         Args:
@@ -67,17 +63,17 @@ def compute_breakdown(self, real_data, synthetic_data):
             real_data = pd.to_numeric(real_data)
             synthetic_data = pd.to_numeric(synthetic_data)
 
-        if self.statistic == 'mean':
+        if statistic == 'mean':
             score_real = real_data.mean()
             score_synthetic = synthetic_data.mean()
-        elif self.statistic == 'std':
+        elif statistic == 'std':
             score_real = real_data.std()
             score_synthetic = synthetic_data.std()
-        elif self.statistic == 'median':
+        elif statistic == 'median':
             score_real = real_data.median()
             score_synthetic = synthetic_data.median()
         else:
-            raise ValueError(f'requested statistic {self.statistic} is not valid. '
+            raise ValueError(f'requested statistic {statistic} is not valid. '
                              'Please choose either mean, std, or median.')
 
         score = 1 - abs(score_real - score_synthetic) / (real_data.max() - real_data.min())
diff --git a/sdmetrics/single_table/multi_column_pairs.py b/sdmetrics/single_table/multi_column_pairs.py
index f96a90fc..7443bf13 100644
--- a/sdmetrics/single_table/multi_column_pairs.py
+++ b/sdmetrics/single_table/multi_column_pairs.py
@@ -186,3 +186,28 @@ class ContingencySimilarity(MultiColumnPairsMetric):
 
     field_types = ('boolean', 'categorical')
     column_pairs_metric = column_pairs.statistical.contingency_similarity.ContingencySimilarity
+
+
+class CorrelationSimilarity(MultiColumnPairsMetric):
+    """MultiColumnPairsMetric based on ColumnPairs CorrelationSimilarity.
+
+    This computes the correlation between column pairs based on the specified coefficient,
+    which defaults to 'Pearson'.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
+            ColumnPairs CorrelationSimilarity.
+        field_types (dict):
+            Field types to which the ColumnPairs metric will be applied.
+    """
+
+    field_types = ('numerical', 'datetime')
+    column_pairs_metric = column_pairs.statistical.correlation_similarity.CorrelationSimilarity
diff --git a/tests/unit/column_pairs/__init__.py b/tests/unit/column_pairs/__init__.py
new file mode 100644
index 00000000..aac7ef87
--- /dev/null
+++ b/tests/unit/column_pairs/__init__.py
@@ -0,0 +1 @@
+"""Unit tests for the column pairs module."""
diff --git a/tests/unit/column_pairs/statistical/__init__.py b/tests/unit/column_pairs/statistical/__init__.py
new file mode 100644
index 00000000..13be4859
--- /dev/null
+++ b/tests/unit/column_pairs/statistical/__init__.py
@@ -0,0 +1 @@
+"""Unit tests for the column pairs statistical metrics."""
diff --git a/tests/unit/column_pairs/statistical/test_correlation_similarity.py b/tests/unit/column_pairs/statistical/test_correlation_similarity.py
new file mode 100644
index 00000000..715767e2
--- /dev/null
+++ b/tests/unit/column_pairs/statistical/test_correlation_similarity.py
@@ -0,0 +1,150 @@
+from datetime import datetime
+from unittest.mock import Mock, call, patch
+
+import pandas as pd
+
+from sdmetrics.column_pairs.statistical import CorrelationSimilarity
+from tests.utils import SeriesMatcher
+
+
+class TestCorrelationSimilarity:
+
+    @patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr')
+    def test_compute_breakdown(self, pearson_mock):
+        """Test the ``compute_breakdown`` method.
+
+        Expect that the selected coefficient is used to compare the real and synthetic data.
+
+        Setup:
+        - Patch the ``scipy.stats.pearsonr`` method to return a test result.
+
+        Input:
+        - Mocked real data.
+        - Mocked synthetic data.
+
+        Output:
+        - A mapping of the metric results, containing the score and the real and synthetic results.
+        """
+        # Setup
+        real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
+        synthetic_data = pd.DataFrame({'col1': [0.9, 1.8, 3.1, 5.0], 'col2': [2, 3, 4, 1]})
+        score_real = -0.451
+        score_synthetic = -0.003
+        pearson_mock.side_effect = [(score_real, 0.0), (score_synthetic, 0.0)]
+        expected_score_breakdown = {
+            'score': 1 - abs(score_real - score_synthetic) / 2,
+            'real': score_real,
+            'synthetic': score_synthetic,
+        }
+
+        # Run
+        metric = CorrelationSimilarity()
+        result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson')
+
+        # Assert
+        assert pearson_mock.has_calls(
+            call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])),
+            call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])),
+        )
+        assert result == expected_score_breakdown
+
+    @patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr')
+    def test_compute_breakdown_datetime(self, pearson_mock):
+        """Test the ``compute_breakdown`` method with datetime input.
+
+        Expect that the selected coefficient is used to compare the real and synthetic data.
+
+        Setup:
+        - Patch the ``scipy.stats.pearsonr`` method to return a test result.
+
+        Input:
+        - Mocked real data.
+        - Mocked synthetic data.
+
+        Output:
+        - A mapping of the metric results, containing the score and the real and synthetic results.
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [datetime(2020, 1, 3), datetime(2020, 10, 13), datetime(2021, 5, 3)],
+            'col2': [datetime(2021, 7, 23), datetime(2021, 8, 3), datetime(2020, 9, 24)],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [datetime(2021, 9, 19), datetime(2021, 10, 1), datetime(2020, 3, 1)],
+            'col2': [datetime(2022, 4, 28), datetime(2021, 7, 31), datetime(2020, 4, 2)],
+        })
+        score_real = 0.2
+        score_synthetic = 0.1
+        pearson_mock.side_effect = [(score_real, 0.0), (score_synthetic, 0.0)]
+        expected_score_breakdown = {
+            'score': 1 - abs(score_real - score_synthetic) / 2,
+            'real': score_real,
+            'synthetic': score_synthetic,
+        }
+
+        # Run
+        metric = CorrelationSimilarity()
+        result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson')
+
+        # Assert
+        assert pearson_mock.has_calls(
+            call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])),
+            call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])),
+        )
+        assert result == expected_score_breakdown
+
+    def test_compute(self):
+        """Test the ``compute`` method.
+
+        Expect that the selected coefficient is used to compare the real and synthetic data.
+
+        Setup:
+        - Mock the ``compute_breakdown`` method to return a test score.
+
+        Input:
+        - Real data.
+        - Synthetic data.
+
+        Output:
+        - The evaluated metric.
+ """ + # Setup + test_score = 0.2 + score_breakdown = {'score': test_score} + metric = CorrelationSimilarity() + + # Run + with patch.object( + CorrelationSimilarity, + 'compute_breakdown', + return_value=score_breakdown, + ): + result = metric.compute(Mock(), Mock(), coefficient='Pearson') + + # Assert + assert result == test_score + + @patch( + 'sdmetrics.column_pairs.statistical.correlation_similarity.ColumnPairsMetric.normalize' + ) + def test_normalize(self, normalize_mock): + """Test the ``normalize`` method. + + Expect that the inherited ``normalize`` method is called. + + Input: + - Raw score + + Output: + - The output of the inherited ``normalize`` method. + """ + # Setup + metric = CorrelationSimilarity() + raw_score = 0.9 + + # Run + result = metric.normalize(raw_score) + + # Assert + normalize_mock.assert_called_once_with(raw_score) + assert result == normalize_mock.return_value diff --git a/tests/unit/column_pairs/test_base.py b/tests/unit/column_pairs/test_base.py new file mode 100644 index 00000000..d0d248aa --- /dev/null +++ b/tests/unit/column_pairs/test_base.py @@ -0,0 +1,32 @@ +from unittest.mock import Mock, patch + +from sdmetrics.column_pairs.base import ColumnPairsMetric + + +class TestColumnPairsMetric: + + def test_compute_breakdown(self): + """Test the ``compute_breakdown`` method. + + Expect a breakdown dictionary is returned that contains the score. + + Setup: + - Mock the ``compute`` method to return a fake score. + + Input: + - Real data. + - Synthetic data. + + Output: + - The evaluated metric. + """ + # Setup + metric = ColumnPairsMetric() + test_metric_score = 0.5 + + # Run + with patch.object(ColumnPairsMetric, 'compute', return_value=test_metric_score): + result = metric.compute_breakdown(Mock(), Mock()) + + # Assert + assert result == {'score': test_metric_score} diff --git a/tests/unit/single_column/__init__.py b/tests/unit/single_column/__init__.py new file mode 100644 index 00000000..051ac498 --- /dev/null +++ b/tests/unit/single_column/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the single column module.""" diff --git a/tests/unit/single_column/statistical/__init__.py b/tests/unit/single_column/statistical/__init__.py new file mode 100644 index 00000000..9ba140e4 --- /dev/null +++ b/tests/unit/single_column/statistical/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the statistical single column metrics.""" diff --git a/tests/unit/single_column/statistical/test_statistic_similarity.py b/tests/unit/single_column/statistical/test_statistic_similarity.py index 9c493291..a85d5368 100644 --- a/tests/unit/single_column/statistical/test_statistic_similarity.py +++ b/tests/unit/single_column/statistical/test_statistic_similarity.py @@ -26,10 +26,10 @@ def test_compute_breakdown(self): real_data = pd.Series([1.0, 2.4, 2.6, 0.8]) synthetic_data = pd.Series([0.9, 1.8, 3.1, 5.0]) - metric = StatisticSimilarity(statistic='mean') + metric = StatisticSimilarity() # Run - result = metric.compute_breakdown(real_data, synthetic_data) + result = metric.compute_breakdown(real_data, synthetic_data, statistic='mean') # Assert assert result == {'score': 1 - (2.7 - 1.7) / 1.8, 'real': 1.7, 'synthetic': 2.7} @@ -52,11 +52,11 @@ def test_compute(self): # Setup metric_breakdown = {'score': 0.56, 'real': 1.7, 'synthetic': 2.7} - metric = StatisticSimilarity(statistic='mean') + metric = StatisticSimilarity() # Run with patch.object(StatisticSimilarity, 'compute_breakdown', return_value=metric_breakdown): - result = metric.compute(Mock(), Mock()) + result = metric.compute(Mock(), 
 
         # Assert
         assert result == 0.56
@@ -74,7 +74,7 @@ def test_normalize(self, normalize_mock):
         - The output of the inherited ``normalize`` method.
         """
         # Setup
-        metric = StatisticSimilarity(statistic='mean')
+        metric = StatisticSimilarity()
         raw_score = 0.9
 
         # Run
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 00000000..37f6a71c
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,28 @@
+"""Utils for testing."""
+import pandas as pd
+
+
+class DataFrameMatcher:
+    """Match a given Pandas DataFrame in a mock function call."""
+
+    def __init__(self, df):
+        """Initialize the DataFrame."""
+        self.df = df
+
+    def __eq__(self, other):
+        """Assert equality using pandas testing module."""
+        pd.testing.assert_frame_equal(self.df, other)
+        return True
+
+
+class SeriesMatcher:
+    """Match a given Pandas Series in a mock function call."""
+
+    def __init__(self, data):
+        """Initialize the Series."""
+        self.data = data
+
+    def __eq__(self, other):
+        """Assert equality using pandas testing module."""
+        pd.testing.assert_series_equal(self.data, other)
+        return True
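
Usage sketch (not part of the patch): a minimal illustration of how the classmethod APIs introduced above are meant to be invoked, based on the signatures and docstrings in this diff. The frames and column names below are made up for the example.

    import pandas as pd

    from sdmetrics.column_pairs.statistical import CorrelationSimilarity
    from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity

    # Two numerical columns drawn from the real and the synthetic table (illustrative data).
    real = pd.DataFrame({'age': [23, 35, 41, 52], 'income': [30.1, 42.5, 50.0, 61.3]})
    synthetic = pd.DataFrame({'age': [25, 33, 44, 50], 'income': [31.2, 40.0, 52.3, 60.0]})

    # Column-pair correlation similarity; coefficient is either 'Pearson' or 'Spearman'.
    score = CorrelationSimilarity.compute(real, synthetic, coefficient='Pearson')

    # The breakdown also exposes the per-dataset correlations under the 'real' and 'synthetic' keys.
    breakdown = CorrelationSimilarity.compute_breakdown(real, synthetic, coefficient='Spearman')

    # StatisticSimilarity is now stateless: the statistic is passed per call instead of at __init__.
    mean_score = StatisticSimilarity.compute(real['income'], synthetic['income'], statistic='mean')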