Commit 3a7a9fa

intersection of rows as the datasets have no mutual key/connection (#385)
1 parent fa8e539 commit 3a7a9fa
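
This commit handles comparisons where the two datasets share nothing beyond their join key(s), so there are no non-key columns to match on. As a rough, illustrative sketch of that scenario with the pandas Compare class (the column name and values below are invented for demonstration, not taken from the commit or its tests):

    import pandas as pd
    import datacompy

    # Both frames consist solely of the join column; there are no value columns to compare.
    df1 = pd.DataFrame({"id": [1, 2, 3]})
    df2 = pd.DataFrame({"id": [2, 3, 4]})

    compare = datacompy.Compare(df1, df2, join_columns="id")

    # With this change, mismatch reporting falls back to the rows unique to each side
    # instead of relying on per-column "_match" columns that do not exist here.
    print(compare.all_mismatch())
    print(compare.report())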

7 files changed: +366 −60 lines changed

datacompy/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """
 
-__version__ = "0.16.3"
+__version__ = "0.16.4"
 
 import platform
 from warnings import warn

datacompy/base.py
Lines changed: 4 additions & 0 deletions

@@ -158,6 +158,10 @@ def report(
         """Return a string representation of a report."""
         pass
 
+    def only_join_columns(self) -> bool:
+        """Boolean on if the only columns are the join columns."""
+        return set(self.join_columns) == set(self.df1.columns) == set(self.df2.columns)
+
 
 def temp_column_name(*dataframes) -> str:
     """Get a temp column name that isn't included in columns of any dataframes.

datacompy/core.py
Lines changed: 72 additions & 23 deletions

@@ -340,14 +340,23 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         otherwise.
         """
         LOG.debug("Comparing intersection")
-        row_cnt = len(self.intersect_rows)
         for column in self.intersect_columns():
             if column in self.join_columns:
-                match_cnt = row_cnt
-                col_match = ""
+                col_match = column + "_match"
+                if not self.only_join_columns():
+                    row_cnt = len(self.intersect_rows)
+                    match_cnt = len(self.intersect_rows[column])
+                else:
+                    row_cnt = (
+                        len(self.intersect_rows)
+                        + len(self.df1_unq_rows)
+                        + len(self.df2_unq_rows)
+                    )
+                    match_cnt = len(self.intersect_rows[column])
                 max_diff = 0.0
                 null_diff = 0
             else:
+                row_cnt = len(self.intersect_rows)
                 col_1 = column + "_" + self.df1_name
                 col_2 = column + "_" + self.df2_name
                 col_match = column + "_match"

@@ -428,6 +437,8 @@ def count_matching_rows(self) -> int:
 
     def intersect_rows_match(self) -> bool:
         """Check whether the intersect rows all match."""
+        if self.intersect_rows.empty:
+            return False
         actual_length = self.intersect_rows.shape[0]
         return self.count_matching_rows() == actual_length
 

@@ -470,7 +481,7 @@ def subset(self) -> bool:
 
     def sample_mismatch(
         self, column: str, sample_count: int = 10, for_display: bool = False
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame | None:
         """Return sample mismatches.
 
         Gets a sub-dataframe which contains the identifying

@@ -492,27 +503,53 @@ def sample_mismatch(
             A sample of the intersection dataframe, containing only the
             "pertinent" columns, for rows that don't match on the provided
             column.
+
+        None
+            When the column being requested is not an intersecting column between dataframes.
         """
-        row_cnt = self.intersect_rows.shape[0]
-        col_match = self.intersect_rows[column + "_match"]
-        match_cnt = col_match.sum()
-        sample_count = min(sample_count, row_cnt - match_cnt)
-        sample = self.intersect_rows[~col_match].sample(sample_count)
-        return_cols = [
-            *self.join_columns,
-            column + "_" + self.df1_name,
-            column + "_" + self.df2_name,
-        ]
-        to_return = sample[return_cols]
-        if for_display:
-            to_return.columns = pd.Index(
-                [
-                    *self.join_columns,
-                    column + " (" + self.df1_name + ")",
-                    column + " (" + self.df2_name + ")",
-                ]
+        if not self.only_join_columns() and column not in self.join_columns:
+            row_cnt = self.intersect_rows.shape[0]
+            try:
+                col_match = self.intersect_rows[column + "_match"]
+            except KeyError:
+                LOG.error(
+                    f"Column: {column} is not an intersecting column. No mismatches can be generated."
+                )
+                return None
+            match_cnt = col_match.sum()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            sample = self.intersect_rows[~col_match].sample(sample_count)
+            return_cols = [
+                *self.join_columns,
+                column + "_" + self.df1_name,
+                column + "_" + self.df2_name,
+            ]
+            to_return = sample[return_cols]
+            if for_display:
+                to_return.columns = pd.Index(
+                    [
+                        *self.join_columns,
+                        column + " (" + self.df1_name + ")",
+                        column + " (" + self.df2_name + ")",
+                    ]
+                )
+            return to_return
+        else:
+            row_cnt = (
+                len(self.intersect_rows)
+                + len(self.df1_unq_rows)
+                + len(self.df2_unq_rows)
             )
-        return to_return
+            col_match = self.intersect_rows[column]
+            match_cnt = col_match.count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            sample = pd.concat(
+                [self.df1_unq_rows[[column]], self.df2_unq_rows[[column]]]
+            ).sample(sample_count)
+            to_return = sample
+            if for_display:
+                to_return.columns = pd.Index([column])
+            return to_return
 
     def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
         """Get all rows with any columns that have a mismatch.

@@ -532,6 +569,10 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
         """
         match_list = []
         return_list = []
+        if self.only_join_columns():
+            LOG.info("Only join keys in data, returning mismatches based on unq_rows")
+            return pd.concat([self.df1_unq_rows, self.df2_unq_rows])
+
         for col in self.intersect_rows.columns:
             if col.endswith("_match"):
                 orig_col_name = col[:-6]

@@ -560,6 +601,14 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
                     LOG.debug(
                         f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
                     )
+        if len(match_list) == 0:
+            LOG.info("No match columns found, returning mismatches based on unq_rows")
+            return pd.concat(
+                [
+                    self.df1_unq_rows[self.join_columns],
+                    self.df2_unq_rows[self.join_columns],
+                ]
+            )
 
         mm_bool = self.intersect_rows[match_list].all(axis="columns")
         return self.intersect_rows[~mm_bool][self.join_columns + return_list]
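
A rough sketch of how the updated pandas sample_mismatch behaves after this change (the dataframes and the non-existent column name are illustrative assumptions, not from the commit):

    import pandas as pd
    import datacompy

    df1 = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})
    df2 = pd.DataFrame({"id": [1, 2, 4], "amount": [10.0, 25.0, 40.0]})
    compare = datacompy.Compare(df1, df2, join_columns="id")

    # An intersecting value column still returns a small sample of mismatching rows.
    print(compare.sample_mismatch("amount"))

    # A column that is not in the intersection now logs an error and returns None
    # instead of raising a KeyError.
    print(compare.sample_mismatch("not_a_column"))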

datacompy/polars.py
Lines changed: 74 additions & 33 deletions

@@ -83,8 +83,8 @@ class PolarsCompare(BaseCompare):
 
     def __init__(
         self,
-        df1: "pl.DataFrame",
-        df2: "pl.DataFrame",
+        df1: pl.DataFrame,
+        df2: pl.DataFrame,
         join_columns: List[str] | str,
         abs_tol: float = 0,
         rel_tol: float = 0,

@@ -126,25 +126,25 @@ def __init__(
         self._compare(ignore_spaces=ignore_spaces, ignore_case=ignore_case)
 
     @property
-    def df1(self) -> "pl.DataFrame":
+    def df1(self) -> pl.DataFrame:
         """Get the first dataframe."""
         return self._df1
 
     @df1.setter
-    def df1(self, df1: "pl.DataFrame") -> None:
+    def df1(self, df1: pl.DataFrame) -> None:
         """Check that it is a dataframe and has the join columns."""
         self._df1 = df1
         self._validate_dataframe(
             "df1", cast_column_names_lower=self.cast_column_names_lower
         )
 
     @property
-    def df2(self) -> "pl.DataFrame":
+    def df2(self) -> pl.DataFrame:
         """Get the second dataframe."""
         return self._df2
 
     @df2.setter
-    def df2(self, df2: "pl.DataFrame") -> None:
+    def df2(self, df2: pl.DataFrame) -> None:
         """Check that it is a dataframe and has the join columns."""
         self._df2 = df2
         self._validate_dataframe(

@@ -331,14 +331,23 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         null_diff: int | float
 
         LOG.debug("Comparing intersection")
-        row_cnt = len(self.intersect_rows)
         for column in self.intersect_columns():
             if column in self.join_columns:
-                match_cnt = row_cnt
-                col_match = ""
+                col_match = column + "_match"
+                if not self.only_join_columns():
+                    row_cnt = len(self.intersect_rows)
+                    match_cnt = len(self.intersect_rows[column])
+                else:
+                    row_cnt = (
+                        len(self.intersect_rows)
+                        + len(self.df1_unq_rows)
+                        + len(self.df2_unq_rows)
+                    )
+                    match_cnt = len(self.intersect_rows[column])
                 max_diff = 0.0
                 null_diff = 0
             else:
+                row_cnt = len(self.intersect_rows)
                 col_1 = column + "_" + self.df1_name
                 col_2 = column + "_" + self.df2_name
                 col_match = column + "_match"

@@ -429,6 +438,8 @@ def count_matching_rows(self) -> int:
 
     def intersect_rows_match(self) -> bool:
         """Check whether the intersect rows all match."""
+        if self.intersect_rows.is_empty():
+            return False
         actual_length = self.intersect_rows.shape[0]
         return self.count_matching_rows() == actual_length
 

@@ -471,7 +482,7 @@ def subset(self) -> bool:
 
     def sample_mismatch(
         self, column: str, sample_count: int = 10, for_display: bool = False
-    ) -> "pl.DataFrame":
+    ) -> pl.DataFrame | None:
         """Return sample mismatches.
 
         Get a sub-dataframe which contains the identifying

@@ -493,29 +504,46 @@ def sample_mismatch(
             A sample of the intersection dataframe, containing only the
             "pertinent" columns, for rows that don't match on the provided
             column.
+
+        None
+            When the column being requested is not an intersecting column between dataframes.
         """
-        row_cnt = self.intersect_rows.shape[0]
-        col_match = self.intersect_rows[column + "_match"]
-        match_cnt = col_match.sum()
-        sample_count = min(sample_count, row_cnt - match_cnt)  # type: ignore
-        sample = self.intersect_rows.filter(
-            pl.col(column + "_match") != True  # noqa: E712
-        ).sample(sample_count)
-        return_cols = [
-            *self.join_columns,
-            column + "_" + self.df1_name,
-            column + "_" + self.df2_name,
-        ]
-        to_return = sample[return_cols]
-        if for_display:
-            to_return.columns = [
+        if not self.only_join_columns() and column not in self.join_columns:
+            row_cnt = self.intersect_rows.shape[0]
+            col_match = self.intersect_rows[column + "_match"]
+            match_cnt = col_match.sum()
+            sample_count = min(sample_count, row_cnt - match_cnt)  # type: ignore
+            sample = self.intersect_rows.filter(
+                pl.col(column + "_match") != True  # noqa: E712
+            ).sample(sample_count)
+            return_cols = [
                 *self.join_columns,
-                column + " (" + self.df1_name + ")",
-                column + " (" + self.df2_name + ")",
+                column + "_" + self.df1_name,
+                column + "_" + self.df2_name,
             ]
-        return to_return
-
-    def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
+            to_return = sample[return_cols]
+            if for_display:
+                to_return.columns = [
+                    *self.join_columns,
+                    column + " (" + self.df1_name + ")",
+                    column + " (" + self.df2_name + ")",
+                ]
+            return to_return
+        else:
+            row_cnt = (
+                len(self.intersect_rows)
+                + len(self.df1_unq_rows)
+                + len(self.df2_unq_rows)
+            )
+            col_match = self.intersect_rows[column]
+            match_cnt = col_match.count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            sample = pl.concat(
+                [self.df1_unq_rows[[column]], self.df2_unq_rows[[column]]]
+            ).sample(sample_count)
+            return sample
+
+    def all_mismatch(self, ignore_matching_cols: bool = False) -> pl.DataFrame:
         """Get all rows with any columns that have a mismatch.
 
         Returns all df1 and df2 versions of the columns and join

@@ -533,6 +561,10 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
         """
         match_list = []
         return_list = []
+        if self.only_join_columns():
+            LOG.info("Only join keys in data, returning mismatches based on unq_rows")
+            return pl.concat([self.df1_unq_rows, self.df2_unq_rows])
+
         for col in self.intersect_rows.columns:
             if col.endswith("_match"):
                 orig_col_name = col[:-6]

@@ -561,6 +593,15 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
                     LOG.debug(
                         f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
                     )
+        if len(match_list) == 0:
+            LOG.info("No match columns found, returning mismatches based on unq_rows")
+            return pl.concat(
+                [
+                    self.df1_unq_rows.select(self.join_columns),
+                    self.df2_unq_rows.select(self.join_columns),
+                ]
+            )
+
         return (
             self.intersect_rows.with_columns(__all=pl.all_horizontal(match_list))
             .filter(pl.col("__all") != True)  # noqa: E712

@@ -595,7 +636,7 @@ def report(
         The report, formatted kinda nicely.
         """
 
-        def df_to_str(pdf: "pl.DataFrame") -> str:
+        def df_to_str(pdf: pl.DataFrame) -> str:
             return pdf.to_pandas().to_string()
 
         # Header

@@ -887,7 +928,7 @@ def compare_string_and_date_columns(
 
 
 def get_merged_columns(
-    original_df: "pl.DataFrame", merged_df: "pl.DataFrame", suffix: str
+    original_df: pl.DataFrame, merged_df: pl.DataFrame, suffix: str
 ) -> List[str]:
     """Get the columns from an original dataframe, in the new merged dataframe.
 

@@ -936,7 +977,7 @@ def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float:
 
 
 def generate_id_within_group(
-    dataframe: "pl.DataFrame", join_columns: List[str]
+    dataframe: pl.DataFrame, join_columns: List[str]
 ) -> "pl.Series":
     """Generate an ID column that can be used to deduplicate identical rows.
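The polars changes mirror the pandas ones. A minimal, illustrative sketch of the only-join-columns case with PolarsCompare (keys chosen so the frames share no rows; the data is invented, not from the commit):

    import polars as pl
    from datacompy import PolarsCompare

    # Frames that contain only the join key and have no keys in common.
    df1 = pl.DataFrame({"id": [1, 2, 3]})
    df2 = pl.DataFrame({"id": [4, 5, 6]})

    compare = PolarsCompare(df1, df2, join_columns="id")

    # all_mismatch() now returns the concatenation of each side's unique rows.
    print(compare.all_mismatch())

    # With the new guard, intersect_rows_match() returns False for an empty intersection.
    print(compare.intersect_rows_match())
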
pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ maintainers = [
     { name="Raymond Haffar", email="[email protected]" },
 ]
 license = {text = "Apache Software License"}
-dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.3,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.22.0,>=0.20.4"]
+dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.3,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "polars[pandas]<=1.23.0,>=0.20.4"]
 requires-python = ">=3.10.0"
 classifiers = [
     "Intended Audience :: Developers",
