Skip to content

Commit eb708e3

Browse files
authored
Merge pull request #393 from capitalone/develop
Release v0.16.4
2 parents 124e952 + 937dc72 commit eb708e3

File tree

11 files changed

+894
-136
lines changed

11 files changed

+894
-136
lines changed

datacompy/__init__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
Then extended to carry that functionality over to Spark Dataframes.
1919
"""
2020

21-
__version__ = "0.16.3"
21+
__version__ = "0.16.4"
2222

2323
import platform
2424
from warnings import warn

datacompy/base.py

+4
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,10 @@ def report(
158158
"""Return a string representation of a report."""
159159
pass
160160

161+
def only_join_columns(self) -> bool:
162+
"""Boolean on if the only columns are the join columns."""
163+
return set(self.join_columns) == set(self.df1.columns) == set(self.df2.columns)
164+
161165

162166
def temp_column_name(*dataframes) -> str:
163167
"""Get a temp column name that isn't included in columns of any dataframes.

datacompy/core.py

+73 -29
Original file line number | Diff line number | Diff line change
@@ -340,14 +340,22 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
340340
otherwise.
341341
"""
342342
LOG.debug("Comparing intersection")
343-
row_cnt = len(self.intersect_rows)
344343
for column in self.intersect_columns():
345344
if column in self.join_columns:
346-
match_cnt = row_cnt
347-
col_match = ""
345+
col_match = column + "_match"
346+
match_cnt = len(self.intersect_rows)
347+
if not self.only_join_columns():
348+
row_cnt = len(self.intersect_rows)
349+
else:
350+
row_cnt = (
351+
len(self.intersect_rows)
352+
+ len(self.df1_unq_rows)
353+
+ len(self.df2_unq_rows)
354+
)
348355
max_diff = 0.0
349356
null_diff = 0
350357
else:
358+
row_cnt = len(self.intersect_rows)
351359
col_1 = column + "_" + self.df1_name
352360
col_2 = column + "_" + self.df2_name
353361
col_match = column + "_match"
@@ -428,6 +436,8 @@ def count_matching_rows(self) -> int:
428436

429437
def intersect_rows_match(self) -> bool:
430438
"""Check whether the intersect rows all match."""
439+
if self.intersect_rows.empty:
440+
return False
431441
actual_length = self.intersect_rows.shape[0]
432442
return self.count_matching_rows() == actual_length
433443

@@ -470,7 +480,7 @@ def subset(self) -> bool:
470480

471481
def sample_mismatch(
472482
self, column: str, sample_count: int = 10, for_display: bool = False
473-
) -> pd.DataFrame:
483+
) -> pd.DataFrame | None:
474484
"""Return sample mismatches.
475485
476486
Gets a sub-dataframe which contains the identifying
@@ -492,27 +502,53 @@ def sample_mismatch(
492502
A sample of the intersection dataframe, containing only the
493503
"pertinent" columns, for rows that don't match on the provided
494504
column.
505+
506+
None
507+
When the column being requested is not an intersecting column between dataframes.
495508
"""
496-
row_cnt = self.intersect_rows.shape[0]
497-
col_match = self.intersect_rows[column + "_match"]
498-
match_cnt = col_match.sum()
499-
sample_count = min(sample_count, row_cnt - match_cnt)
500-
sample = self.intersect_rows[~col_match].sample(sample_count)
501-
return_cols = [
502-
*self.join_columns,
503-
column + "_" + self.df1_name,
504-
column + "_" + self.df2_name,
505-
]
506-
to_return = sample[return_cols]
507-
if for_display:
508-
to_return.columns = pd.Index(
509-
[
510-
*self.join_columns,
511-
column + " (" + self.df1_name + ")",
512-
column + " (" + self.df2_name + ")",
513-
]
509+
if not self.only_join_columns() and column not in self.join_columns:
510+
row_cnt = self.intersect_rows.shape[0]
511+
try:
512+
col_match = self.intersect_rows[column + "_match"]
513+
except KeyError:
514+
LOG.error(
515+
f"Column: {column} is not an intersecting column. No mismatches can be generated."
516+
)
517+
return None
518+
match_cnt = col_match.sum()
519+
sample_count = min(sample_count, row_cnt - match_cnt)
520+
sample = self.intersect_rows[~col_match].sample(sample_count)
521+
return_cols = [
522+
*self.join_columns,
523+
column + "_" + self.df1_name,
524+
column + "_" + self.df2_name,
525+
]
526+
to_return = sample[return_cols]
527+
if for_display:
528+
to_return.columns = pd.Index(
529+
[
530+
*self.join_columns,
531+
column + " (" + self.df1_name + ")",
532+
column + " (" + self.df2_name + ")",
533+
]
534+
)
535+
return to_return
536+
else:
537+
row_cnt = (
538+
len(self.intersect_rows)
539+
+ len(self.df1_unq_rows)
540+
+ len(self.df2_unq_rows)
514541
)
515-
return to_return
542+
col_match = self.intersect_rows[column]
543+
match_cnt = col_match.count()
544+
sample_count = min(sample_count, row_cnt - match_cnt)
545+
sample = pd.concat(
546+
[self.df1_unq_rows[[column]], self.df2_unq_rows[[column]]]
547+
).sample(sample_count)
548+
to_return = sample
549+
if for_display:
550+
to_return.columns = pd.Index([column])
551+
return to_return
516552

517553
def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
518554
"""Get all rows with any columns that have a mismatch.
@@ -532,6 +568,10 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
532568
"""
533569
match_list = []
534570
return_list = []
571+
if self.only_join_columns():
572+
LOG.info("Only join keys in data, returning mismatches based on unq_rows")
573+
return pd.concat([self.df1_unq_rows, self.df2_unq_rows])
574+
535575
for col in self.intersect_rows.columns:
536576
if col.endswith("_match"):
537577
orig_col_name = col[:-6]
@@ -560,6 +600,14 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
560600
LOG.debug(
561601
f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
562602
)
603+
if len(match_list) == 0:
604+
LOG.info("No match columns found, returning mismatches based on unq_rows")
605+
return pd.concat(
606+
[
607+
self.df1_unq_rows[self.join_columns],
608+
self.df2_unq_rows[self.join_columns],
609+
]
610+
)
563611

564612
mm_bool = self.intersect_rows[match_list].all(axis="columns")
565613
return self.intersect_rows[~mm_bool][self.join_columns + return_list]
@@ -851,13 +899,9 @@ def columns_equal(
851899
| (col_1.isnull() & col_2.isnull())
852900
)
853901
except Exception:
854-
# Check for string[pyarrow] and string[python]
855-
if col_1.dtype in (
856-
"string[python]",
857-
"string[pyarrow]",
858-
) and col_2.dtype in ("string[python]", "string[pyarrow]"):
902+
try:
859903
compare = pd.Series(col_1.astype(str) == col_2.astype(str))
860-
else: # Blanket exception should just return all False
904+
except Exception: # Blanket exception should just return all False
861905
compare = pd.Series(False, index=col_1.index)
862906
compare.index = col_1.index
863907
return compare

datacompy/polars.py

+73 -33
Original file line number | Diff line number | Diff line change
@@ -83,8 +83,8 @@ class PolarsCompare(BaseCompare):
8383

8484
def __init__(
8585
self,
86-
df1: "pl.DataFrame",
87-
df2: "pl.DataFrame",
86+
df1: pl.DataFrame,
87+
df2: pl.DataFrame,
8888
join_columns: List[str] | str,
8989
abs_tol: float = 0,
9090
rel_tol: float = 0,
@@ -126,25 +126,25 @@ def __init__(
126126
self._compare(ignore_spaces=ignore_spaces, ignore_case=ignore_case)
127127

128128
@property
129-
def df1(self) -> "pl.DataFrame":
129+
def df1(self) -> pl.DataFrame:
130130
"""Get the first dataframe."""
131131
return self._df1
132132

133133
@df1.setter
134-
def df1(self, df1: "pl.DataFrame") -> None:
134+
def df1(self, df1: pl.DataFrame) -> None:
135135
"""Check that it is a dataframe and has the join columns."""
136136
self._df1 = df1
137137
self._validate_dataframe(
138138
"df1", cast_column_names_lower=self.cast_column_names_lower
139139
)
140140

141141
@property
142-
def df2(self) -> "pl.DataFrame":
142+
def df2(self) -> pl.DataFrame:
143143
"""Get the second dataframe."""
144144
return self._df2
145145

146146
@df2.setter
147-
def df2(self, df2: "pl.DataFrame") -> None:
147+
def df2(self, df2: pl.DataFrame) -> None:
148148
"""Check that it is a dataframe and has the join columns."""
149149
self._df2 = df2
150150
self._validate_dataframe(
@@ -331,14 +331,22 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
331331
null_diff: int | float
332332

333333
LOG.debug("Comparing intersection")
334-
row_cnt = len(self.intersect_rows)
335334
for column in self.intersect_columns():
336335
if column in self.join_columns:
337-
match_cnt = row_cnt
338-
col_match = ""
336+
col_match = column + "_match"
337+
match_cnt = len(self.intersect_rows)
338+
if not self.only_join_columns():
339+
row_cnt = len(self.intersect_rows)
340+
else:
341+
row_cnt = (
342+
len(self.intersect_rows)
343+
+ len(self.df1_unq_rows)
344+
+ len(self.df2_unq_rows)
345+
)
339346
max_diff = 0.0
340347
null_diff = 0
341348
else:
349+
row_cnt = len(self.intersect_rows)
342350
col_1 = column + "_" + self.df1_name
343351
col_2 = column + "_" + self.df2_name
344352
col_match = column + "_match"
@@ -429,6 +437,8 @@ def count_matching_rows(self) -> int:
429437

430438
def intersect_rows_match(self) -> bool:
431439
"""Check whether the intersect rows all match."""
440+
if self.intersect_rows.is_empty():
441+
return False
432442
actual_length = self.intersect_rows.shape[0]
433443
return self.count_matching_rows() == actual_length
434444

@@ -471,7 +481,7 @@ def subset(self) -> bool:
471481

472482
def sample_mismatch(
473483
self, column: str, sample_count: int = 10, for_display: bool = False
474-
) -> "pl.DataFrame":
484+
) -> pl.DataFrame | None:
475485
"""Return sample mismatches.
476486
477487
Get a sub-dataframe which contains the identifying
@@ -493,29 +503,46 @@ def sample_mismatch(
493503
A sample of the intersection dataframe, containing only the
494504
"pertinent" columns, for rows that don't match on the provided
495505
column.
506+
507+
None
508+
When the column being requested is not an intersecting column between dataframes.
496509
"""
497-
row_cnt = self.intersect_rows.shape[0]
498-
col_match = self.intersect_rows[column + "_match"]
499-
match_cnt = col_match.sum()
500-
sample_count = min(sample_count, row_cnt - match_cnt) # type: ignore
501-
sample = self.intersect_rows.filter(
502-
pl.col(column + "_match") != True # noqa: E712
503-
).sample(sample_count)
504-
return_cols = [
505-
*self.join_columns,
506-
column + "_" + self.df1_name,
507-
column + "_" + self.df2_name,
508-
]
509-
to_return = sample[return_cols]
510-
if for_display:
511-
to_return.columns = [
510+
if not self.only_join_columns() and column not in self.join_columns:
511+
row_cnt = self.intersect_rows.shape[0]
512+
col_match = self.intersect_rows[column + "_match"]
513+
match_cnt = col_match.sum()
514+
sample_count = min(sample_count, row_cnt - match_cnt) # type: ignore
515+
sample = self.intersect_rows.filter(
516+
pl.col(column + "_match") != True # noqa: E712
517+
).sample(sample_count)
518+
return_cols = [
512519
*self.join_columns,
513-
column + " (" + self.df1_name + ")",
514-
column + " (" + self.df2_name + ")",
520+
column + "_" + self.df1_name,
521+
column + "_" + self.df2_name,
515522
]
516-
return to_return
517-
518-
def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
523+
to_return = sample[return_cols]
524+
if for_display:
525+
to_return.columns = [
526+
*self.join_columns,
527+
column + " (" + self.df1_name + ")",
528+
column + " (" + self.df2_name + ")",
529+
]
530+
return to_return
531+
else:
532+
row_cnt = (
533+
len(self.intersect_rows)
534+
+ len(self.df1_unq_rows)
535+
+ len(self.df2_unq_rows)
536+
)
537+
col_match = self.intersect_rows[column]
538+
match_cnt = col_match.count()
539+
sample_count = min(sample_count, row_cnt - match_cnt)
540+
sample = pl.concat(
541+
[self.df1_unq_rows[[column]], self.df2_unq_rows[[column]]]
542+
).sample(sample_count)
543+
return sample
544+
545+
def all_mismatch(self, ignore_matching_cols: bool = False) -> pl.DataFrame:
519546
"""Get all rows with any columns that have a mismatch.
520547
521548
Returns all df1 and df2 versions of the columns and join
@@ -533,6 +560,10 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
533560
"""
534561
match_list = []
535562
return_list = []
563+
if self.only_join_columns():
564+
LOG.info("Only join keys in data, returning mismatches based on unq_rows")
565+
return pl.concat([self.df1_unq_rows, self.df2_unq_rows])
566+
536567
for col in self.intersect_rows.columns:
537568
if col.endswith("_match"):
538569
orig_col_name = col[:-6]
@@ -561,6 +592,15 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
561592
LOG.debug(
562593
f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
563594
)
595+
if len(match_list) == 0:
596+
LOG.info("No match columns found, returning mismatches based on unq_rows")
597+
return pl.concat(
598+
[
599+
self.df1_unq_rows.select(self.join_columns),
600+
self.df2_unq_rows.select(self.join_columns),
601+
]
602+
)
603+
564604
return (
565605
self.intersect_rows.with_columns(__all=pl.all_horizontal(match_list))
566606
.filter(pl.col("__all") != True) # noqa: E712
@@ -595,7 +635,7 @@ def report(
595635
The report, formatted kinda nicely.
596636
"""
597637

598-
def df_to_str(pdf: "pl.DataFrame") -> str:
638+
def df_to_str(pdf: pl.DataFrame) -> str:
599639
return pdf.to_pandas().to_string()
600640

601641
# Header
@@ -887,7 +927,7 @@ def compare_string_and_date_columns(
887927

888928

889929
def get_merged_columns(
890-
original_df: "pl.DataFrame", merged_df: "pl.DataFrame", suffix: str
930+
original_df: pl.DataFrame, merged_df: pl.DataFrame, suffix: str
891931
) -> List[str]:
892932
"""Get the columns from an original dataframe, in the new merged dataframe.
893933
@@ -936,7 +976,7 @@ def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float:
936976

937977

938978
def generate_id_within_group(
939-
dataframe: "pl.DataFrame", join_columns: List[str]
979+
dataframe: pl.DataFrame, join_columns: List[str]
940980
) -> "pl.Series":
941981
"""Generate an ID column that can be used to deduplicate identical rows.
942982

0 commit comments

Comments
 (0)