
Commit 5efe92a

intersection of rows as the datasets have no mutual key/connection - Spark/SF (#388)
* spark full join
* snowflake full join
* order import to fix spark actions tests
* use pandas to compare Spark and SQL test outputs
* replace isEmpty for version compatibility
* sample mismatch column check
1 parent 3a7a9fa commit 5efe92a

File tree: 6 files changed, +489 −74 lines changed

datacompy/core.py

Lines changed: 1 addition & 2 deletions
@@ -343,16 +343,15 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         for column in self.intersect_columns():
             if column in self.join_columns:
                 col_match = column + "_match"
+                match_cnt = len(self.intersect_rows)
                 if not self.only_join_columns():
                     row_cnt = len(self.intersect_rows)
-                    match_cnt = len(self.intersect_rows[column])
                 else:
                     row_cnt = (
                         len(self.intersect_rows)
                         + len(self.df1_unq_rows)
                         + len(self.df2_unq_rows)
                     )
-                    match_cnt = len(self.intersect_rows[column])
                 max_diff = 0.0
                 null_diff = 0
             else:
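In plain terms: for a join column every intersecting row is a match by construction, so match_cnt is now taken once from len(self.intersect_rows) before the branch, while row_cnt still grows to include the rows unique to either side when the frames contain nothing but the join key. A minimal sketch of that scenario through the public pandas Compare API (frame contents invented for illustration; the expected counts follow from the join itself):

    import pandas as pd
    import datacompy

    # Two frames that consist only of the join key.
    df1 = pd.DataFrame({"id": [1, 2, 3]})
    df2 = pd.DataFrame({"id": [2, 3, 4]})

    compare = datacompy.Compare(df1, df2, join_columns=["id"])

    print(len(compare.intersect_rows))  # expected: 2 (ids 2 and 3)
    print(len(compare.df1_unq_rows))    # expected: 1 (id 1)
    print(len(compare.df2_unq_rows))    # expected: 1 (id 4)

    # For the "id" column the report now counts 2 matches out of
    # 2 + 1 + 1 = 4 rows rather than 2 out of 2.
    print(compare.report())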

datacompy/polars.py

Lines changed: 1 addition & 2 deletions
@@ -334,16 +334,15 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         for column in self.intersect_columns():
             if column in self.join_columns:
                 col_match = column + "_match"
+                match_cnt = len(self.intersect_rows)
                 if not self.only_join_columns():
                     row_cnt = len(self.intersect_rows)
-                    match_cnt = len(self.intersect_rows[column])
                 else:
                     row_cnt = (
                         len(self.intersect_rows)
                         + len(self.df1_unq_rows)
                         + len(self.df2_unq_rows)
                     )
-                    match_cnt = len(self.intersect_rows[column])
                 max_diff = 0.0
                 null_diff = 0
             else:
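The Polars comparator gets the identical treatment, so the match rate for join columns is computed the same way regardless of backend. A sketch of the same keys-only comparison against the Polars class (assuming it is exposed as PolarsCompare in datacompy.polars; adjust the import to your installed version):

    import polars as pl
    from datacompy.polars import PolarsCompare  # class name assumed

    df1 = pl.DataFrame({"id": [1, 2, 3]})
    df2 = pl.DataFrame({"id": [2, 3, 4]})

    compare = PolarsCompare(df1, df2, join_columns=["id"])

    # Same arithmetic as the pandas sketch above: for "id",
    # match_cnt = len(intersect_rows) = 2 and
    # row_cnt = 2 + len(df1_unq_rows) + len(df2_unq_rows) = 4.
    print(compare.report())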

datacompy/snowflake.py

Lines changed: 90 additions & 37 deletions
@@ -39,6 +39,7 @@
 import snowflake.snowpark as sp
 from snowflake.connector.errors import DatabaseError, ProgrammingError
 from snowflake.snowpark import Window
+from snowflake.snowpark.exceptions import SnowparkSQLException
 from snowflake.snowpark.functions import (
     abs,
     col,

@@ -425,32 +426,38 @@ def _intersect_compare(self, ignore_spaces: bool) -> None:
            self.abs_tol,
            ignore_spaces,
        )
-        row_cnt = self.intersect_rows.count()

         with ThreadPoolExecutor() as executor:
             futures = []
             for column in self.intersect_columns():
-                future = executor.submit(
-                    self._calculate_column_compare_stats, column, row_cnt
-                )
+                future = executor.submit(self._calculate_column_compare_stats, column)
                 futures.append(future)
             for future in as_completed(futures):
                 if future.exception():
                     raise future.exception()

-    def _calculate_column_compare_stats(self, column: str, row_cnt: int) -> None:
+    def _calculate_column_compare_stats(self, column: str) -> None:
         """Populate the column stats for all intersecting column pairs.

         Calculates compare stats by intersecting column pairs. For the non-trivial case
         where intersecting columns are not join columns, a match count, max difference,
         and null difference must be calculated.
         """
         if column in self.join_columns:
-            match_cnt = row_cnt
-            col_match = ""
+            col_match = column + "_MATCH"
+            match_cnt = self.intersect_rows.count()
+            if not self.only_join_columns():
+                row_cnt = self.intersect_rows.count()
+            else:
+                row_cnt = (
+                    self.intersect_rows.count()
+                    + self.df1_unq_rows.count()
+                    + self.df2_unq_rows.count()
+                )
             max_diff = 0
             null_diff = 0
         else:
+            row_cnt = self.intersect_rows.count()
             col_1 = column + "_" + self.df1_name
             col_2 = column + "_" + self.df2_name
             col_match = column + "_MATCH"

@@ -551,6 +558,8 @@ def count_matching_rows(self) -> int:

     def intersect_rows_match(self) -> bool:
         """Check whether the intersect rows all match."""
+        if self.intersect_rows.count() == 0:
+            return False
         actual_length = self.intersect_rows.count()
         return self.count_matching_rows() == actual_length

@@ -616,37 +625,62 @@ def sample_mismatch(
         "pertinent" columns, for rows that don't match on the provided
         column.
         """
-        row_cnt = self.intersect_rows.count()
-        col_match = self.intersect_rows.select(column + "_MATCH")
-        match_cnt = col_match.where(
-            col(column + "_MATCH") == True  # noqa: E712
-        ).count()
-        sample_count = min(sample_count, row_cnt - match_cnt)
-        sample = (
-            self.intersect_rows.where(col(column + "_MATCH") == False)  # noqa: E712
-            .drop(column + "_MATCH")
-            .limit(sample_count)
-        )
-
-        for c in self.join_columns:
-            sample = sample.withColumnRenamed(c + "_" + self.df1_name, c)
-
-        return_cols = [
-            *self.join_columns,
-            column + "_" + self.df1_name,
-            column + "_" + self.df2_name,
-        ]
-        to_return = sample.select(return_cols)
-
-        if for_display:
-            return to_return.toDF(
-                *[
-                    *self.join_columns,
-                    column + " (" + self.df1_name + ")",
-                    column + " (" + self.df2_name + ")",
-                ]
-            )
-        return to_return
+        column = column.upper()
+        if not self.only_join_columns() and column not in self.join_columns:
+            row_cnt = self.intersect_rows.count()
+            col_match = self.intersect_rows.select(column + "_MATCH")
+            try:
+                col_match.collect()
+            except SnowparkSQLException:
+                LOG.error(
+                    f"Column: {column} is not an intersecting column. No mismatches can be generated."
+                )
+                return None
+            match_cnt = col_match.where(
+                col(column + "_MATCH") == True  # noqa: E712
+            ).count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            sample = (
+                self.intersect_rows.where(col(column + "_MATCH") == False)  # noqa: E712
+                .drop(column + "_MATCH")
+                .limit(sample_count)
+            )
+
+            for c in self.join_columns:
+                sample = sample.withColumnRenamed(c + "_" + self.df1_name, c)
+
+            return_cols = [
+                *self.join_columns,
+                column + "_" + self.df1_name,
+                column + "_" + self.df2_name,
+            ]
+            to_return = sample.select(return_cols)
+
+            if for_display:
+                return to_return.toDF(
+                    *[
+                        *self.join_columns,
+                        column + " (" + self.df1_name + ")",
+                        column + " (" + self.df2_name + ")",
+                    ]
+                )
+            return to_return
+        else:
+            row_cnt = (
+                self.intersect_rows.count()
+                + self.df1_unq_rows.count()
+                + self.df2_unq_rows.count()
+            )
+            match_cnt = self.intersect_rows.count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            df1_col = column + "_" + self.df1_name
+            df2_col = column + "_" + self.df2_name
+            sample = (
+                self.df1_unq_rows[[df1_col]]
+                .union_all(self.df2_unq_rows[[df2_col]])
+                .limit(sample_count)
+            )
+            return sample.toDF(column)

     def all_mismatch(self, ignore_matching_cols: bool = False) -> "sp.DataFrame":
         """Get all rows with any columns that have a mismatch.

@@ -666,6 +700,16 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "sp.DataFrame":
         """
         match_list = []
         return_list = []
+        if self.only_join_columns():
+            LOG.info("Only join keys in data, returning mismatches based on unq_rows")
+            df1_cols = [f"{cols}_{self.df1_name}" for cols in self.join_columns]
+            df2_cols = [f"{cols}_{self.df2_name}" for cols in self.join_columns]
+            to_return = self.df1_unq_rows[df1_cols].union_all(
+                self.df2_unq_rows[df2_cols]
+            )
+            for c in self.join_columns:
+                to_return = to_return.withColumnRenamed(c + "_" + self.df1_name, c)
+            return to_return
         for c in self.intersect_rows.columns:
             if c.endswith("_MATCH"):
                 orig_col_name = c[:-6]

@@ -699,7 +743,16 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "sp.DataFrame":
             LOG.debug(
                 f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
             )
-
+        if len(match_list) == 0:
+            LOG.info("No match columns found, returning mismatches based on unq_rows")
+            df1_cols = [f"{cols}_{self.df1_name}" for cols in self.join_columns]
+            df2_cols = [f"{cols}_{self.df2_name}" for cols in self.join_columns]
+            to_return = self.df1_unq_rows[df1_cols].union_all(
+                self.df2_unq_rows[df2_cols]
+            )
+            for c in self.join_columns:
+                to_return = to_return.withColumnRenamed(c + "_" + self.df1_name, c)
+            return to_return
         mm_rows = self.intersect_rows.withColumn(
             "match_array", concat(*match_list)
         ).where(contains(col("match_array"), lit("false")))
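Beyond mirroring the row and match counting, the Snowflake changes add two guards: intersect_rows_match() now returns False outright when the intersection is empty, and sample_mismatch() logs an error and returns None when the requested column has no match column to collect (the SnowparkSQLException path). For the keys-only case, both sample_mismatch() and all_mismatch() fall back to unioning the keys unique to each side and renaming them back to the bare join-column name. A backend-free sketch of that fallback shape using pandas stand-ins (frame contents and dataset names invented for illustration; in Snowpark the equivalent is union_all plus withColumnRenamed, as in the hunks above):

    import pandas as pd

    # Stand-ins for df1_unq_rows / df2_unq_rows after the outer join,
    # where join columns carry a _<dataframe name> suffix.
    join_columns, df1_name, df2_name = ["ID"], "ORIGINAL", "NEW"
    df1_unq_rows = pd.DataFrame({"ID_ORIGINAL": [1]})
    df2_unq_rows = pd.DataFrame({"ID_NEW": [4]})

    # Mirror of the unq_rows fallback: take the suffixed key columns from
    # each side, stack them, and restore the plain join-column name.
    df1_cols = [f"{c}_{df1_name}" for c in join_columns]
    df2_cols = [f"{c}_{df2_name}" for c in join_columns]
    mismatches = pd.concat(
        [
            df1_unq_rows[df1_cols].set_axis(join_columns, axis=1),
            df2_unq_rows[df2_cols].set_axis(join_columns, axis=1),
        ],
        ignore_index=True,
    )
    print(mismatches)  # one row per key present on only one side: 1 and 4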

datacompy/spark/sql.py

Lines changed: 74 additions & 31 deletions
@@ -459,14 +459,22 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         LOG.debug("Comparing intersection")
         max_diff: float
         null_diff: int
-        row_cnt = self.intersect_rows.count()
         for column in self.intersect_columns():
             if column in self.join_columns:
-                match_cnt = row_cnt
-                col_match = ""
+                col_match = column + "_match"
+                match_cnt = self.intersect_rows.count()
+                if not self.only_join_columns():
+                    row_cnt = self.intersect_rows.count()
+                else:
+                    row_cnt = (
+                        self.intersect_rows.count()
+                        + self.df1_unq_rows.count()
+                        + self.df2_unq_rows.count()
+                    )
                 max_diff = 0
                 null_diff = 0
             else:
+                row_cnt = self.intersect_rows.count()
                 col_1 = column + "_" + self.df1_name
                 col_2 = column + "_" + self.df2_name
                 col_match = column + "_match"

@@ -561,6 +569,8 @@ def count_matching_rows(self) -> int:

     def intersect_rows_match(self) -> bool:
         """Check whether the intersect rows all match."""
+        if self.intersect_rows.count() == 0:
+            return False
         actual_length = self.intersect_rows.count()
         return self.count_matching_rows() == actual_length

@@ -621,37 +631,54 @@ def sample_mismatch(
         "pertinent" columns, for rows that don't match on the provided
         column.
         """
-        row_cnt = self.intersect_rows.count()
-        col_match = self.intersect_rows.select(column + "_match")
-        match_cnt = col_match.where(
-            col(column + "_match") == True  # noqa: E712
-        ).count()
-        sample_count = min(sample_count, row_cnt - match_cnt)
-        sample = (
-            self.intersect_rows.where(col(column + "_match") == False)  # noqa: E712
-            .drop(column + "_match")
-            .limit(sample_count)
-        )
-
-        for c in self.join_columns:
-            sample = sample.withColumnRenamed(c + "_" + self.df1_name, c)
-
-        return_cols = [
-            *self.join_columns,
-            column + "_" + self.df1_name,
-            column + "_" + self.df2_name,
-        ]
-        to_return = sample.select(return_cols)
-
-        if for_display:
-            return to_return.toDF(
-                *[
-                    *self.join_columns,
-                    column + " (" + self.df1_name + ")",
-                    column + " (" + self.df2_name + ")",
-                ]
-            )
-        return to_return
+        if not self.only_join_columns() and column not in self.join_columns:
+            row_cnt = self.intersect_rows.count()
+            col_match = self.intersect_rows.select(column + "_match")
+            match_cnt = col_match.where(
+                col(column + "_match") == True  # noqa: E712
+            ).count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            sample = (
+                self.intersect_rows.where(col(column + "_match") == False)  # noqa: E712
+                .drop(column + "_match")
+                .limit(sample_count)
+            )

+            for c in self.join_columns:
+                sample = sample.withColumnRenamed(c + "_" + self.df1_name, c)
+
+            return_cols = [
+                *self.join_columns,
+                column + "_" + self.df1_name,
+                column + "_" + self.df2_name,
+            ]
+            to_return = sample.select(return_cols)
+
+            if for_display:
+                return to_return.toDF(
+                    *[
+                        *self.join_columns,
+                        column + " (" + self.df1_name + ")",
+                        column + " (" + self.df2_name + ")",
+                    ]
+                )
+            return to_return
+        else:
+            row_cnt = (
+                self.intersect_rows.count()
+                + self.df1_unq_rows.count()
+                + self.df2_unq_rows.count()
+            )
+            match_cnt = self.intersect_rows.count()
+            sample_count = min(sample_count, row_cnt - match_cnt)
+            df1_col = column + "_" + self.df1_name
+            df2_col = column + "_" + self.df2_name
+            sample = (
+                self.df1_unq_rows[[df1_col]]
+                .union(self.df2_unq_rows[[df2_col]])
+                .limit(sample_count)
+            )
+            return sample.toDF(column)

     def all_mismatch(
         self, ignore_matching_cols: bool = False

@@ -673,6 +700,14 @@ def all_mismatch(
         """
         match_list = []
         return_list = []
+        if self.only_join_columns():
+            LOG.info("Only join keys in data, returning mismatches based on unq_rows")
+            df1_cols = [f"{cols}_{self.df1_name}" for cols in self.join_columns]
+            df2_cols = [f"{cols}_{self.df2_name}" for cols in self.join_columns]
+            to_return = self.df1_unq_rows[df1_cols].union(self.df2_unq_rows[df2_cols])
+            for c in self.join_columns:
+                to_return = to_return.withColumnRenamed(c + "_" + self.df1_name, c)
+            return to_return
         for c in self.intersect_rows.columns:
             if c.endswith("_match"):
                 orig_col_name = c[:-6]

@@ -707,6 +742,14 @@ def all_mismatch(
             LOG.debug(
                 f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
             )
+        if len(match_list) == 0:
+            LOG.info("No match columns found, returning mismatches based on unq_rows")
+            df1_cols = [f"{cols}_{self.df1_name}" for cols in self.join_columns]
+            df2_cols = [f"{cols}_{self.df2_name}" for cols in self.join_columns]
+            to_return = self.df1_unq_rows[df1_cols].union(self.df2_unq_rows[df2_cols])
+            for c in self.join_columns:
+                to_return = to_return.withColumnRenamed(c + "_" + self.df1_name, c)
+            return to_return

         mm_rows = self.intersect_rows.withColumn(
             "match_array", array(match_list)
