Add correlation similarity metric #158

Merged: 5 commits, Jul 20, 2022
20 changes: 19 additions & 1 deletion sdmetrics/column_pairs/base.py
@@ -35,7 +35,25 @@ def compute(real_data, synthetic_data):
pandas.DataFrame with 2 columns.

Returns:
- Union[float, tuple[float]]:
+ float:
Metric output.
"""
raise NotImplementedError()

@classmethod
def compute_breakdown(cls, real_data, synthetic_data):
"""Compute the breakdown of this metric.

Args:
real_data (pandas.DataFrame):
The values from the real dataset, passed as pandas.DataFrame
with 2 columns.
synthetic_data (pandas.DataFrame):
The values from the synthetic dataset, passed as a
pandas.DataFrame with 2 columns.

Returns:
dict:
A mapping of the metric output. Must contain the key 'score'.
"""
return {'score': cls.compute(real_data, synthetic_data)}
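
The default ``compute_breakdown`` simply wraps ``compute``, so existing metrics keep working while new ones can return richer breakdowns. A minimal illustrative subclass (hypothetical, not part of this PR) shows the contract:

```python
# Hypothetical toy subclass, only to illustrate the breakdown contract.
from sdmetrics.column_pairs.base import ColumnPairsMetric


class ConstantSimilarity(ColumnPairsMetric):
    """Toy metric that always scores 1.0."""

    @staticmethod
    def compute(real_data, synthetic_data):
        return 1.0


# The inherited compute_breakdown wraps compute's result under 'score':
# ConstantSimilarity.compute_breakdown(real_df, synth_df) == {'score': 1.0}
```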
2 changes: 2 additions & 0 deletions sdmetrics/column_pairs/statistical/__init__.py
@@ -1,11 +1,13 @@
"""Statistical Metrics to compare column pairs."""

from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity
+ from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity
from sdmetrics.column_pairs.statistical.kl_divergence import (
ContinuousKLDivergence, DiscreteKLDivergence)

__all__ = [
'ContingencySimilarity',
'ContinuousKLDivergence',
+ 'CorrelationSimilarity',
'DiscreteKLDivergence',
]
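
With the export above in place, the new metric is importable from the statistical subpackage:

```python
from sdmetrics.column_pairs.statistical import CorrelationSimilarity

print(CorrelationSimilarity.name)  # 'CorrelationSimilarity'
print(CorrelationSimilarity.min_value, CorrelationSimilarity.max_value)  # 0.0 1.0
```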
97 changes: 97 additions & 0 deletions sdmetrics/column_pairs/statistical/correlation_similarity.py
@@ -0,0 +1,97 @@
"""Correlation Similarity Metric."""

import pandas as pd
from scipy.stats import pearsonr, spearmanr

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.goal import Goal
from sdmetrics.utils import is_datetime


class CorrelationSimilarity(ColumnPairsMetric):
"""Correlation similarity metric.

Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
"""

name = 'CorrelationSimilarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0

@classmethod
def compute_breakdown(cls, real_data, synthetic_data, coefficient='Pearson'):
"""Compare the breakdown of correlation similarity of two continuous columns.

Args:
real_data (Union[numpy.ndarray, pandas.Series]):
The values from the real dataset.
synthetic_data (Union[numpy.ndarray, pandas.Series]):
The values from the synthetic dataset.

Returns:
dict:
A dict containing the score, and the real and synthetic metric values.
"""
real_data[pd.isna(real_data)] = 0.0
synthetic_data[pd.isna(synthetic_data)] = 0.0
column1, column2 = real_data.columns[:2]

if is_datetime(real_data):
[Review thread on the is_datetime check above]

Contributor: should we error if synthetic data isn't also datetime?

Contributor Author (@katxiao, Jul 18, 2022): Hm, I have a couple of thoughts:

  1. This handling is used in many metrics across the column_pairs and single_column metrics. I do think it would be nice to add more comprehensive validation, but I'm not sure it's worth adding in this PR, since that would make the metrics less unified.
  2. Most usage should go through the single table metrics, which filter the metrics by data type and only apply the relevant ones. In that flow the correct metric is always applied for a given data type, which is why I don't think this is a huge issue right now. Of course, people can still choose to invoke the column pair metric directly.

I'm in favor of opening another issue about adding data type verification to the base classes of ColumnPairsMetric and SingleColumnMetric, and addressing it for all metrics. Let me know what you think.

Contributor: Makes sense to me.

Contributor Author (@katxiao): Cool, added an issue for tracking here: #168

real_data = pd.to_numeric(real_data)
synthetic_data = pd.to_numeric(synthetic_data)

correlation_fn = None
if coefficient == 'Pearson':
correlation_fn = pearsonr
elif coefficient == 'Spearman':
correlation_fn = spearmanr
else:
raise ValueError(f'requested coefficient {coefficient} is not valid. '
'Please choose either Pearson or Spearman.')

correlation_real = correlation_fn(real_data[column1], real_data[column2])
correlation_synthetic = correlation_fn(synthetic_data[column1], synthetic_data[column2])
return {
'score': 1 - abs(correlation_real - correlation_synthetic) / 2,
'real': correlation_real,
'synthetic': correlation_synthetic,
}

@classmethod
def compute(cls, real_data, synthetic_data, coefficient='Pearson'):
"""Compare the correlation similarity of two continuous columns.

Args:
real_data (pandas.DataFrame):
The values from the real dataset, passed as a pandas.DataFrame with 2 columns.
synthetic_data (pandas.DataFrame):
The values from the synthetic dataset, passed as a pandas.DataFrame with 2 columns.
coefficient (str):
The correlation coefficient to use: 'Pearson' (default) or 'Spearman'.

Returns:
float:
The correlation similarity of the two columns.
"""
return cls.compute_breakdown(real_data, synthetic_data, coefficient)['score']

@classmethod
def normalize(cls, raw_score):
"""Return the `raw_score` as is, since it is already normalized.

Args:
raw_score (float):
The value of the metric from `compute`.

Returns:
float:
The normalized value of the metric.
"""
return super().normalize(raw_score)
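
The score is a direct rescaling of the gap between the two correlations: each coefficient lies in [-1, 1], so the absolute difference is at most 2, and dividing by 2 maps the score into [0, 1]. A standalone sketch of the formula, using the same data as the tests further below, without going through the class:

```python
import pandas as pd
from scipy.stats import pearsonr

real = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
synthetic = pd.DataFrame({'col1': [0.9, 1.8, 3.1, 5.0], 'col2': [2, 3, 4, 1]})

# scipy's pearsonr returns (coefficient, p-value); only the coefficient matters here.
r_real, _ = pearsonr(real['col1'], real['col2'])
r_synthetic, _ = pearsonr(synthetic['col1'], synthetic['col2'])

# |r_real - r_synthetic| <= 2, so the score lands in [0, 1],
# where 1 means the two datasets have identical correlation.
score = 1 - abs(r_real - r_synthetic) / 2
```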
22 changes: 9 additions & 13 deletions sdmetrics/single_column/statistical/statistic_similarity.py
@@ -19,20 +19,15 @@ class StatisticSimilarity(SingleColumnMetric):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
- statistic (str):
-     The statistic to compute the metric on (mean, std, or median). Defaults to mean.
"""

name = 'StatisticSimilarity'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0
- statistic = 'mean'

- def __init__(self, statistic='mean'):
-     self.statistic = statistic

- def compute(self, real_data, synthetic_data):
+ @classmethod
+ def compute(cls, real_data, synthetic_data, statistic='mean'):
"""Compare the statistic similarity of two continuous columns.

Args:
@@ -45,9 +40,10 @@ def compute(self, real_data, synthetic_data):
float:
The statistical similarity of the two columns.
"""
- return self.compute_breakdown(real_data, synthetic_data)['score']
+ return cls.compute_breakdown(real_data, synthetic_data, statistic)['score']

- def compute_breakdown(self, real_data, synthetic_data):
+ @staticmethod
+ def compute_breakdown(real_data, synthetic_data, statistic='mean'):
"""Compare the breakdown of statistic similarity of two continuous columns.

Args:
@@ -67,17 +63,17 @@ def compute_breakdown(self, real_data, synthetic_data):
real_data = pd.to_numeric(real_data)
synthetic_data = pd.to_numeric(synthetic_data)

- if self.statistic == 'mean':
+ if statistic == 'mean':
score_real = real_data.mean()
score_synthetic = synthetic_data.mean()
- elif self.statistic == 'std':
+ elif statistic == 'std':
score_real = real_data.std()
score_synthetic = synthetic_data.std()
- elif self.statistic == 'median':
+ elif statistic == 'median':
score_real = real_data.median()
score_synthetic = synthetic_data.median()
else:
- raise ValueError(f'requested statistic {self.statistic} is not valid. '
+ raise ValueError(f'requested statistic {statistic} is not valid. '
'Please choose either mean, std, or median.')

score = 1 - abs(score_real - score_synthetic) / (real_data.max() - real_data.min())
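
For reference, the refactored scoring normalizes the statistic gap by the real data's range, so a score of 1 means the statistics match exactly. A standalone numeric sketch:

```python
import pandas as pd

real = pd.Series([1.0, 2.0, 3.0, 4.0])        # mean 2.5, range 3.0
synthetic = pd.Series([1.5, 2.5, 3.5, 4.5])   # mean 3.0

# The gap between the statistics is normalized by the real data's range:
score = 1 - abs(real.mean() - synthetic.mean()) / (real.max() - real.min())
# 1 - 0.5 / 3.0 = 0.8333...
```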
25 changes: 25 additions & 0 deletions sdmetrics/single_table/multi_column_pairs.py
@@ -186,3 +186,28 @@ class ContingencySimilarity(MultiColumnPairsMetric):

field_types = ('boolean', 'categorical')
column_pairs_metric = column_pairs.statistical.contingency_similarity.ContingencySimilarity


class CorrelationSimilarity(MultiColumnPairsMetric):
"""MultiColumnPairsMetric based on ColumnPairs CorrelationSimilarity.

This computes the correlation between column pairs based on the specified coefficient,
which defaults to 'Pearson'.

Attributes:
name (str):
Name to use when reports about this metric are printed.
goal (sdmetrics.goal.Goal):
The goal of this metric.
min_value (Union[float, tuple[float]]):
Minimum value or values that this metric can take.
max_value (Union[float, tuple[float]]):
Maximum value or values that this metric can take.
column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
ColumnPairs CorrelationSimilarity.
field_types (tuple):
Field types to which the ColumnPairs metric will be applied.
"""

field_types = ('numerical', 'datetime')
column_pairs_metric = column_pairs.statistical.correlation_similarity.CorrelationSimilarity
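
``MultiColumnPairsMetric`` (defined earlier in this file, outside the diff) is what fans a column-pair metric out over every pair of compatible columns in a table. Roughly, and only as a sketch of the idea rather than the actual base-class code, the aggregation works like this:

```python
from itertools import combinations


def apply_over_column_pairs(pair_metric, real_df, synthetic_df, columns):
    """Average a column-pair metric over every pair of the given columns (sketch)."""
    scores = [
        pair_metric.compute(real_df[[a, b]], synthetic_df[[a, b]])
        for a, b in combinations(columns, 2)
    ]
    return sum(scores) / len(scores)
```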
1 change: 1 addition & 0 deletions tests/unit/column_pairs/__init__.py
@@ -0,0 +1 @@
"""Unit tests for the column pairs module."""
1 change: 1 addition & 0 deletions tests/unit/column_pairs/statistical/__init__.py
@@ -0,0 +1 @@
"""Unit tests for the column pairs statistical metrics."""
150 changes: 150 additions & 0 deletions tests/unit/column_pairs/statistical/test_correlation_similarity.py
@@ -0,0 +1,150 @@
from datetime import datetime
from unittest.mock import Mock, call, patch

import pandas as pd

from sdmetrics.column_pairs.statistical import CorrelationSimilarity
from tests.utils import SeriesMatcher


class TestCorrelationSimilarity:

@patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr')
def test_compute_breakdown(self, pearson_mock):
[Review thread on this test]

Contributor: can we do an example with datetime columns?

"""Test the ``compute_breakdown`` method.

Expect that the selected coefficient is used to compare the real and synthetic data.

Setup:
- Patch the ``scipy.stats.pearsonr`` method to return a test result.

Input:
- Mocked real data.
- Mocked synthetic data.

Output:
- A mapping of the metric results, containing the score and the real and synthetic results.
"""
# Setup
real_data = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
synthetic_data = pd.DataFrame({'col1': [0.9, 1.8, 3.1, 5.0], 'col2': [2, 3, 4, 1]})
score_real = -0.451
score_synthetic = -0.003
pearson_mock.side_effect = [score_real, score_synthetic]
expected_score_breakdown = {
'score': 1 - abs(score_real - score_synthetic) / 2,
'real': score_real,
'synthetic': score_synthetic,
}

# Run
metric = CorrelationSimilarity()
result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson')

# Assert
assert pearson_mock.has_calls(
call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])),
call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])),
)
assert result == expected_score_breakdown

@patch('sdmetrics.column_pairs.statistical.correlation_similarity.pearsonr')
def test_compute_breakdown_datetime(self, pearson_mock):
"""Test the ``compute_breakdown`` method with datetime input.

Expect that the selected coefficient is used to compare the real and synthetic data.

Setup:
- Patch the ``scipy.stats.pearsonr`` method to return a test result.

Input:
- Mocked real data.
- Mocked synthetic data.

Output:
- A mapping of the metric results, containing the score and the real and synthetic results.
"""
# Setup
real_data = pd.DataFrame({
'col1': [datetime(2020, 1, 3), datetime(2020, 10, 13), datetime(2021, 5, 3)],
'col2': [datetime(2021, 7, 23), datetime(2021, 8, 3), datetime(2020, 9, 24)],
})
synthetic_data = pd.DataFrame({
'col1': [datetime(2021, 9, 19), datetime(2021, 10, 1), datetime(2020, 3, 1)],
'col2': [datetime(2022, 4, 28), datetime(2021, 7, 31), datetime(2020, 4, 2)],
})
score_real = 0.2
score_synthetic = 0.1
pearson_mock.side_effect = [score_real, score_synthetic]
expected_score_breakdown = {
'score': 1 - abs(score_real - score_synthetic) / 2,
'real': score_real,
'synthetic': score_synthetic,
}

# Run
metric = CorrelationSimilarity()
result = metric.compute_breakdown(real_data, synthetic_data, coefficient='Pearson')

# Assert
assert pearson_mock.has_calls(
call(SeriesMatcher(real_data['col1']), SeriesMatcher(real_data['col2'])),
call(SeriesMatcher(synthetic_data['col1']), SeriesMatcher(synthetic_data['col2'])),
)
assert result == expected_score_breakdown

def test_compute(self):
"""Test the ``compute`` method.

Expect that the selected coefficient is used to compare the real and synthetic data.

Setup:
- Mock the ``compute`` method to return a test score.

Input:
- Real data.
- Synthetic data.

Output:
- The evaluated metric.
"""
# Setup
test_score = 0.2
score_breakdown = {'score': test_score}
metric = CorrelationSimilarity()

# Run
with patch.object(
CorrelationSimilarity,
'compute_breakdown',
return_value=score_breakdown,
):
result = metric.compute(Mock(), Mock(), coefficient='Pearson')

# Assert
assert result == test_score

@patch(
'sdmetrics.column_pairs.statistical.correlation_similarity.ColumnPairsMetric.normalize'
)
def test_normalize(self, normalize_mock):
"""Test the ``normalize`` method.

Expect that the inherited ``normalize`` method is called.

Input:
- Raw score

Output:
- The output of the inherited ``normalize`` method.
"""
# Setup
metric = CorrelationSimilarity()
raw_score = 0.9

# Run
result = metric.normalize(raw_score)

# Assert
normalize_mock.assert_called_once_with(raw_score)
assert result == normalize_mock.return_value
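
``SeriesMatcher`` comes from ``tests.utils``, which is outside this diff. A plausible minimal implementation, assuming it exists to make pandas Series comparable inside ``mock.call`` assertions, would be:

```python
class SeriesMatcher:
    """Equality matcher so pandas Series can be compared in mock call assertions."""

    def __init__(self, series):
        self.series = series

    def __eq__(self, other):
        # pandas Series don't support plain ==-based equality in call comparisons,
        # so delegate to Series.equals for an element-wise check.
        return self.series.equals(other)

    def __repr__(self):
        return f'SeriesMatcher({list(self.series)})'
```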