Skip to content

Commit 3064f9b

Browse files
[Feature] Composite Keys (#863)
1 parent 7b5c44d commit 3064f9b

22 files changed

Lines changed: 682 additions & 129 deletions

File tree

README.md

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,7 @@ my_report = QualityReport.load(filepath='demo_data_quality_report.pkl')
9191
# set by the real data
9292
from sdmetrics.single_column import BoundaryAdherence
9393

94-
BoundaryAdherence.compute(
95-
real_data['start_date'],
96-
synthetic_data['start_date']
97-
)
94+
BoundaryAdherence.compute(real_data['start_date'], synthetic_data['start_date'])
9895
```
9996
```
10097
0.8503937007874016
@@ -104,11 +101,7 @@ BoundaryAdherence.compute(
104101
# calculate whether the synthetic data is new or whether it's an exact copy of the real data
105102
from sdmetrics.single_table import NewRowSynthesis
106103

107-
NewRowSynthesis.compute(
108-
real_data,
109-
synthetic_data,
110-
metadata
111-
)
104+
NewRowSynthesis.compute(real_data, synthetic_data, metadata)
112105
```
113106
```
114107
1.0

sdmetrics/column_pairs/statistical/cardinality_boundary_adherence.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ def compute_breakdown(real_data, synthetic_data):
3333
"""Calculate the percentage of synthetic parents with cardinality in the correct range.
3434
3535
Args:
36-
real_data (tuple(pd.Series, pd.Series)):
37-
A tuple with the real primary key Series as the first element and real
38-
foreign keys Series as the second element.
39-
synthetic_data (tuple(pd.Series, pd.Series)):
36+
real_data (tuple(pd.DataFrame, pd.DataFrame) or tuple(pd.Series, pd.Series)):
37+
A tuple with the real primary key DataFrame as the first element and real
38+
foreign keys DataFrame as the second element.
39+
synthetic_data (tuple(pd.DataFrame, pd.DataFrame) or tuple(pd.Series, pd.Series)):
4040
A tuple with the synthetic primary key as the first element and synthetic
4141
foreign keys as the second element.
4242
@@ -45,10 +45,10 @@ def compute_breakdown(real_data, synthetic_data):
4545
Metric output.
4646
"""
4747
real_cardinality = pd.DataFrame(index=real_data[0].copy())
48-
real_cardinality['cardinality'] = real_data[1].value_counts()
48+
real_cardinality['cardinality'] = real_data[1].value_counts(dropna=False)
4949
real_cardinality = real_cardinality.fillna(0)
5050
synthetic_cardinality = pd.DataFrame(index=synthetic_data[0].copy())
51-
synthetic_cardinality['cardinality'] = synthetic_data[1].value_counts()
51+
synthetic_cardinality['cardinality'] = synthetic_data[1].value_counts(dropna=False)
5252
synthetic_cardinality = synthetic_cardinality.fillna(0)
5353

5454
min_cardinality = real_cardinality['cardinality'].min()
@@ -66,10 +66,10 @@ def compute(cls, real_data, synthetic_data):
6666
"""Calculate the percentage of synthetic parents with cardinality in the correct range.
6767
6868
Args:
69-
real_data (tuple(pd.Series, pd.Series)):
70-
A tuple with the real primary key Series as the first element and real
71-
foreign keys Series as the second element.
72-
synthetic_data (tuple(pd.Series, pd.Series)):
69+
real_data (tuple(pd.DataFrame, pd.DataFrame) or tuple(pd.Series, pd.Series)):
70+
A tuple with the real primary key DataFrame as the first element and real
71+
foreign keys DataFrame as the second element.
72+
synthetic_data (tuple(pd.DataFrame, pd.DataFrame) or tuple(pd.Series, pd.Series)):
7373
A tuple with the synthetic primary key as the first element and synthetic
7474
foreign keys as the second element.
7575

sdmetrics/column_pairs/statistical/contingency_similarity.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ def compute_breakdown(
117117
contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
118118
synthetic
119119
)
120-
combined_index = contingency_real.index.union(contingency_synthetic.index, sort=False)
120+
combined_index = contingency_real.index.union(
121+
contingency_synthetic.index, sort=False
122+
).drop_duplicates()
121123
contingency_synthetic = contingency_synthetic.reindex(combined_index, fill_value=0)
122124
contingency_real = contingency_real.reindex(combined_index, fill_value=0)
123125
diff = abs(contingency_real - contingency_synthetic).fillna(0)

sdmetrics/column_pairs/statistical/referential_integrity.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,41 +31,70 @@ class ReferentialIntegrity(ColumnPairsMetric):
3131
goal = Goal.MAXIMIZE
3232
min_value = 0.0
3333
max_value = 1.0
34+
INDICATOR_NAME = '__ri_indicator__'
35+
36+
@staticmethod
37+
def _create_unique_name(name, list_names):
38+
"""Return ``name`` with underscores appended until it is not in ``list_names``."""
39+
result = name
40+
while result in list_names:
41+
result += '_'
42+
43+
return result
3444

3545
@classmethod
3646
def compute_breakdown(cls, real_data, synthetic_data):
3747
"""Compute the score breakdown of the referential integrity metric.
3848
3949
Args:
40-
real_data (tuple of 2 pandas.Series):
50+
real_data (tuple of 2 pandas.DataFrame):
4151
(primary_key, foreign_key) columns from the real data.
42-
synthetic_data (tuple of 2 pandas.Series):
52+
synthetic_data (tuple of 2 pandas.DataFrame):
4353
(primary_key, foreign_key) columns from the synthetic data.
4454
4555
Returns:
4656
dict:
4757
The score breakdown of the referential integrity metric.
4858
"""
49-
if pd.isna(real_data[1]).any():
50-
synthetic_data = list(synthetic_data)
51-
synthetic_data[1] = synthetic_data[1].dropna()
52-
53-
missing_parents = not real_data[1].isin(real_data[0]).all()
59+
real_pk_df, real_fk_df = real_data
60+
synth_pk_df, synth_fk_df = synthetic_data
61+
pk_columns = list(real_pk_df.columns)
62+
fk_columns = list(real_fk_df.columns)
63+
indicator_name = cls._create_unique_name(cls.INDICATOR_NAME, pk_columns + fk_columns)
64+
65+
real_merged = real_fk_df.merge(
66+
real_pk_df.drop_duplicates(),
67+
how='left',
68+
left_on=fk_columns,
69+
right_on=pk_columns,
70+
indicator=indicator_name,
71+
)
72+
missing_parents = (real_merged[indicator_name] == 'left_only').any()
5473
if missing_parents:
5574
LOGGER.info("The real data has foreign keys that don't reference any primary key.")
5675

57-
score = synthetic_data[1].isin(synthetic_data[0]).mean()
76+
if len(fk_columns) == 1 and pd.isna(real_fk_df[fk_columns[0]]).any():
77+
synth_fk_df = synth_fk_df.dropna()
78+
79+
synth_merged = synth_fk_df.merge(
80+
synth_pk_df.drop_duplicates(),
81+
how='left',
82+
left_on=fk_columns,
83+
right_on=pk_columns,
84+
indicator=indicator_name,
85+
)
5886

87+
score = (synth_merged[indicator_name] == 'both').mean()
5988
return {'score': score}
6089

6190
@classmethod
6291
def compute(cls, real_data, synthetic_data):
6392
"""Compute the referential integrity of two columns.
6493
6594
Args:
66-
real_data (tuple of 2 pandas.Series):
95+
real_data (tuple of 2 pandas.DataFrame):
6796
(primary_key, foreign_key) columns from the real data.
68-
synthetic_data (tuple of 2 pandas.Series):
97+
synthetic_data (tuple of 2 pandas.DataFrame):
6998
(primary_key, foreign_key) columns from the synthetic data.
7099
71100
Returns:

sdmetrics/multi_table/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,6 @@ For example, we could execute the same metric as before by passing the `metadata
9191
of having to specify the individual `foreign_keys`:
9292

9393
```python
94-
In [10]: LogisticParentChildDetection.compute(real_data, synthetic_data, metadata)
94+
In[10]: LogisticParentChildDetection.compute(real_data, synthetic_data, metadata)
9595
Out[10]: 0.7569444444444444
9696
```

sdmetrics/reports/multi_table/_properties/inter_table_trends.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
ColumnPairTrends as SingleTableColumnPairTrends,
1212
)
1313
from sdmetrics.reports.utils import PlotConfig
14+
from sdmetrics.utils import _cast_to_iterable
1415

1516

1617
class InterTableTrends(BaseMultiTableProperty):
@@ -50,16 +51,16 @@ def _denormalize_tables(self, real_data, synthetic_data, relationship):
5051
"""
5152
parent = relationship['parent_table_name']
5253
child = relationship['child_table_name']
53-
foreign_key = relationship['child_foreign_key']
54-
primary_key = relationship['parent_primary_key']
54+
foreign_key = _cast_to_iterable(relationship['child_foreign_key'])
55+
primary_key = _cast_to_iterable(relationship['parent_primary_key'])
5556

5657
real_parent = real_data[parent].add_prefix(f'{parent}.')
5758
real_child = real_data[child].add_prefix(f'{child}.')
5859
synthetic_parent = synthetic_data[parent].add_prefix(f'{parent}.')
5960
synthetic_child = synthetic_data[child].add_prefix(f'{child}.')
6061

61-
child_index = f'{child}.{foreign_key}'
62-
parent_index = f'{parent}.{primary_key}'
62+
child_index = [f'{child}.{key_col}' for key_col in foreign_key]
63+
parent_index = [f'{parent}.{key_col}' for key_col in primary_key]
6364

6465
denormalized_real = real_child.merge(
6566
real_parent, left_on=child_index, right_on=parent_index
@@ -101,7 +102,12 @@ def _merge_metadata(self, metadata, parent_table, child_table):
101102
merged_metadata['columns'] = {**child_cols, **parent_cols}
102103
if 'primary_key' in merged_metadata:
103104
primary_key = merged_metadata['primary_key']
104-
merged_metadata['primary_key'] = f'{child_table}.{primary_key}'
105+
if isinstance(primary_key, list):
106+
merged_metadata['primary_key'] = [
107+
f'{child_table}.{pk_col}' for pk_col in primary_key
108+
]
109+
else:
110+
merged_metadata['primary_key'] = f'{child_table}.{primary_key}'
105111

106112
return merged_metadata, list(parent_cols.keys()), list(child_cols.keys())
107113

@@ -123,6 +129,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
123129
parent = relationship['parent_table_name']
124130
child = relationship['child_table_name']
125131
foreign_key = relationship['child_foreign_key']
132+
fk_tuple = tuple(foreign_key) if isinstance(foreign_key, list) else foreign_key
126133

127134
denormalized_real, denormalized_synthetic = self._denormalize_tables(
128135
real_data, synthetic_data, relationship
@@ -132,14 +139,14 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
132139

133140
parent_child_pairs = itertools.product(parent_cols, child_cols)
134141

135-
self._properties[(parent, child, foreign_key)] = SingleTableColumnPairTrends()
142+
self._properties[(parent, child, fk_tuple)] = SingleTableColumnPairTrends()
136143
self._properties[
137-
(parent, child, foreign_key)
144+
(parent, child, fk_tuple)
138145
].real_correlation_threshold = self.real_correlation_threshold
139146
self._properties[
140-
(parent, child, foreign_key)
147+
(parent, child, fk_tuple)
141148
].real_association_threshold = self.real_association_threshold
142-
details = self._properties[(parent, child, foreign_key)]._generate_details(
149+
details = self._properties[(parent, child, fk_tuple)]._generate_details(
143150
denormalized_real,
144151
denormalized_synthetic,
145152
merged_metadata,
@@ -149,7 +156,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
149156

150157
details['Parent Table'] = parent
151158
details['Child Table'] = child
152-
details['Foreign Key'] = foreign_key
159+
details['Foreign Key'] = str(foreign_key)
153160
if not details.empty:
154161
details['Column 1'] = details['Column 1'].str.replace(
155162
f'{parent}.', '', n=1, regex=False
@@ -233,18 +240,15 @@ def _compute_average_score(self, to_plot):
233240
def get_visualization(self, table_name=None):
234241
"""Create a plot to show the inter table trends data.
235242
236-
Returns:
237-
plotly.graph_objects._figure.Figure
238-
239243
Args:
240244
table_name (str, optional):
241245
Table to plot. Defaults to None.
242246
243-
Raises:
244-
- ``ValueError`` if property has not been computed.
245-
246247
Returns:
247248
plotly.graph_objects._figure.Figure
249+
250+
Raises:
251+
- ``ValueError`` if property has not been computed.
248252
"""
249253
if not self.is_computed:
250254
raise ValueError(

sdmetrics/reports/multi_table/base_multi_table_report.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pandas as pd
44

55
from sdmetrics.reports.base_report import BaseReport
6+
from sdmetrics.utils import _cast_to_iterable
67
from sdmetrics.visualization import set_plotly_config
78

89

@@ -43,22 +44,36 @@ def _validate_data_format(self, real_data, synthetic_data):
4344
def _validate_relationships(self, real_data, synthetic_data, metadata):
4445
"""Validate that the relationships are valid."""
4546
for rel in metadata.get('relationships', []):
46-
parent_dtype = real_data[rel['parent_table_name']][rel['parent_primary_key']].dtype
47-
child_dtype = real_data[rel['child_table_name']][rel['child_foreign_key']].dtype
48-
if (parent_dtype == 'object' and child_dtype != 'object') or (
49-
parent_dtype != 'object' and child_dtype == 'object'
50-
):
51-
parent = rel['parent_table_name']
52-
parent_key = rel['parent_primary_key']
53-
child = rel['child_table_name']
54-
child_key = rel['child_foreign_key']
47+
parent = rel['parent_table_name']
48+
parent_key = rel['parent_primary_key']
49+
child = rel['child_table_name']
50+
child_key = rel['child_foreign_key']
51+
parent_key_str = f"'{parent_key}'" if isinstance(parent_key, str) else str(parent_key)
52+
child_key_str = f"'{child_key}'" if isinstance(child_key, str) else str(child_key)
53+
parent_primary_key = _cast_to_iterable(parent_key)
54+
child_foreign_key = _cast_to_iterable(child_key)
55+
56+
if len(parent_primary_key) != len(child_foreign_key):
5557
error_msg = (
5658
f"The '{parent}' table and '{child}' table cannot be merged "
57-
'for computing the cardinality. Please make sure the primary key'
58-
f" in '{parent}' ('{parent_key}') and the foreign key in '{child}'"
59-
f" ('{child_key}') have the same data type."
59+
'for computing the cardinality. Please make sure the number of columns '
60+
f'in the primary key ({parent_key_str}) matches the number of '
61+
f'columns in the foreign key ({child_key_str}).'
6062
)
6163
raise ValueError(error_msg)
64+
parent_dtypes = real_data[rel['parent_table_name']][parent_primary_key].dtypes
65+
child_dtypes = real_data[rel['child_table_name']][child_foreign_key].dtypes
66+
for parent_dtype, child_dtype in zip(parent_dtypes, child_dtypes):
67+
if (parent_dtype == 'object' and child_dtype != 'object') or (
68+
parent_dtype != 'object' and child_dtype == 'object'
69+
):
70+
error_msg = (
71+
f"The '{parent}' table and '{child}' table cannot be merged "
72+
'for computing the cardinality. Please make sure the primary key'
73+
f" in '{parent}' ({parent_key_str}) and the foreign key in '{child}'"
74+
f' ({child_key_str}) have the same data types.'
75+
)
76+
raise ValueError(error_msg)
6277

6378
def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata):
6479
"""Validate that the metadata matches the data."""

sdmetrics/reports/single_table/_properties/data_validity.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,19 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
3939
The progress bar to use. Defaults to None.
4040
"""
4141
column_names, metric_names, scores = [], [], []
42+
column_sdtypes = [(col, metadata['columns'][col]['sdtype']) for col in metadata['columns']]
4243
error_messages = []
4344
primary_key = metadata.get('primary_key')
45+
if isinstance(primary_key, list):
46+
if len(primary_key) > 1:
47+
column_sdtypes = [(primary_key, None)] + column_sdtypes
48+
else:
49+
primary_key = primary_key[0]
50+
4451
alternate_keys = metadata.get('alternate_keys', [])
4552
sequence_index = metadata.get('sequence_index')
4653

47-
for column_name in metadata['columns']:
48-
sdtype = metadata['columns'][column_name]['sdtype']
54+
for column_name, sdtype in column_sdtypes:
4955
primary_key_match = column_name == primary_key
5056
alternate_key_match = column_name in alternate_keys
5157
is_unique = primary_key_match or alternate_key_match

0 commit comments

Comments
 (0)