Commit fba6a24

columns_equal clean up (#396)
* refactor columns_equal, fixes #121
* refactor polars columns_equal, support for temporal
* remove DATE_TYPE const
1 parent eeffb0e commit fba6a24
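
The fix for #121 is easiest to see with the data from the new test_string_as_numeric test: numeric-looking strings that exceed float precision used to be compared through a float cast and came back equal. A minimal illustrative sketch, assuming datacompy with this commit installed (not part of the diff itself):

import datacompy
import pandas as pd

# Two 34-digit identifiers that differ only in their last digit (taken from
# the new test_string_as_numeric). Cast to float they round to the same value,
# which is why the old float-based fallback reported them as equal.
s1 = pd.Series(["9998700990704001708177961516923014"])
s2 = pd.Series(["9998700990704001708177961516923015"])
print(float(s1[0]) == float(s2[0]))  # True: float precision drops the trailing digits

# With this change, string columns are compared as strings, so they differ.
print(datacompy.columns_equal(s1, s2).tolist())  # expected: [False]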

4 files changed, +165 -85 lines changed


datacompy/core.py

Lines changed: 32 additions & 35 deletions
@@ -859,53 +859,50 @@ def columns_equal(
     """
     default_value = "DATACOMPY_NULL"
     compare: pd.Series[bool]
+    if ignore_spaces:
+        if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
+            col_1 = col_1.str.strip()
+        if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
+            col_2 = col_2.str.strip()
+    if ignore_case:
+        if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
+            col_1 = col_1.str.upper()
+        if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
+            col_2 = col_2.str.upper()

     # short circuit if comparing mixed type columns. We don't want to support this moving forward.
     if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
         col_2
     ).startswith("mixed"):
         compare = pd.Series(False, index=col_1.index)
-        compare.index = col_1.index
-        return compare
-
-    try:
-        compare = pd.Series(
-            np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
-        )
-    except TypeError:
+    elif pd.api.types.is_string_dtype(col_1) and pd.api.types.is_string_dtype(col_2):
         try:
             compare = pd.Series(
-                np.isclose(
-                    col_1.astype(float),
-                    col_2.astype(float),
-                    rtol=rel_tol,
-                    atol=abs_tol,
-                    equal_nan=True,
-                )
+                (col_1.fillna(default_value) == col_2.fillna(default_value))
+                | (col_1.isnull() & col_2.isnull())
+            )
+        except TypeError:
+            compare = pd.Series(col_1.astype(str) == col_2.astype(str))
+    elif {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}:
+        compare = compare_string_and_date_columns(col_1, col_2)
+    else:
+        try:
+            compare = pd.Series(
+                np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
             )
-        except (ValueError, TypeError):
+        except TypeError:
             try:
-                if ignore_spaces:
-                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
-                        col_1 = col_1.str.strip()
-                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
-                        col_2 = col_2.str.strip()
-
-                if ignore_case:
-                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
-                        col_1 = col_1.str.upper()
-                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
-                        col_2 = col_2.str.upper()
-
-                if {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}:
-                    compare = compare_string_and_date_columns(col_1, col_2)
-                else:
-                    compare = pd.Series(
-                        (col_1.fillna(default_value) == col_2.fillna(default_value))
-                        | (col_1.isnull() & col_2.isnull())
+                compare = pd.Series(
+                    np.isclose(
+                        col_1.astype(float),
+                        col_2.astype(float),
+                        rtol=rel_tol,
+                        atol=abs_tol,
+                        equal_nan=True,
                     )
+                )
             except Exception:
-                try:
+                try:  # last check where we just cast to strings
                     compare = pd.Series(col_1.astype(str) == col_2.astype(str))
                 except Exception:  # Blanket exception should just return all False
                     compare = pd.Series(False, index=col_1.index)
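
After this refactor the pandas columns_equal normalizes spaces and case up front, then dispatches on dtype: a string/string pair uses null-aware equality, a datetime/object pair goes through compare_string_and_date_columns, and anything else falls back to np.isclose, then a float cast, then a string cast. An illustrative sketch of the resulting behavior, assuming this commit is installed (the expected values are a reading of the new logic, not captured output):

import datacompy
import pandas as pd

# String vs. string: trimmed and upper-cased first, nulls compare equal to nulls.
strings = pd.Series([" a", "B ", None])
other = pd.Series(["a", "b", None])
result = datacompy.columns_equal(strings, other, ignore_spaces=True, ignore_case=True)
print(result.tolist())  # expected: [True, True, True]

# Datetime vs. string (dtype kinds {"M", "O"}): routed to compare_string_and_date_columns.
dates = pd.to_datetime(pd.Series(["2017-01-01", "2017-01-02"]))
date_strings = pd.Series(["2017-01-01", "2017-01-03"])
print(datacompy.columns_equal(dates, date_strings).tolist())  # expected: [True, False]

# Numeric vs. numeric: falls through to np.isclose with rel_tol/abs_tol.
nums = pd.Series([1.0, 2.0])
near = pd.Series([1.0, 2.05])
print(datacompy.columns_equal(nums, near, abs_tol=0.1).tolist())  # expected: [True, True]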

datacompy/polars.py

Lines changed: 43 additions & 48 deletions
@@ -29,14 +29,12 @@
 import numpy as np
 import polars as pl
 from ordered_set import OrderedSet
-from polars.exceptions import ComputeError, InvalidOperationError

 from datacompy.base import BaseCompare, temp_column_name

 LOG = logging.getLogger(__name__)

 STRING_TYPE = ["String", "Utf8"]
-DATE_TYPE = ["Date", "Datetime"]


 class PolarsCompare(BaseCompare):
@@ -799,13 +797,13 @@ def render(filename: str, *fields: int | float | str) -> str:


 def columns_equal(
-    col_1: "pl.Series",
-    col_2: "pl.Series",
+    col_1: pl.Series,
+    col_2: pl.Series,
     rel_tol: float = 0,
     abs_tol: float = 0,
     ignore_spaces: bool = False,
     ignore_case: bool = False,
-) -> "pl.Series":
+) -> pl.Series:
     """Compare two columns from a dataframe.

     Returns a True/False series,
@@ -841,57 +839,54 @@ def columns_equal(
         values don't match.
     """
     compare: pl.Series
-    try:
+
+    if ignore_spaces:
+        if str(col_1.dtype) in STRING_TYPE:
+            col_1 = col_1.str.strip_chars()
+        if str(col_2.dtype) in STRING_TYPE:
+            col_2 = col_2.str.strip_chars()
+
+    if ignore_case:
+        if str(col_1.dtype) in STRING_TYPE:
+            col_1 = col_1.str.to_uppercase()
+        if str(col_2.dtype) in STRING_TYPE:
+            col_2 = col_2.str.to_uppercase()
+
+    if (str(col_1.dtype) in STRING_TYPE and str(col_2.dtype) in STRING_TYPE) or (
+        col_1.dtype.is_temporal() and col_2.dtype.is_temporal()
+    ):
         compare = pl.Series(
-            np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
+            (col_1.eq_missing(col_2)) | (col_1.is_null() & col_2.is_null())
         )
-    except TypeError:
+    elif (str(col_1.dtype) in STRING_TYPE and str(col_2.dtype).startswith("Date")) or (
+        str(col_1.dtype).startswith("Date") and str(col_2.dtype) in STRING_TYPE
+    ):
+        compare = compare_string_and_date_columns(col_1, col_2)
+    else:
         try:
-            if col_1.dtype in DATE_TYPE or col_2 in DATE_TYPE:
-                raise TypeError("Found date, moving to alternative logic")
-
             compare = pl.Series(
-                np.isclose(
-                    col_1.cast(pl.Float64, strict=True),
-                    col_2.cast(pl.Float64, strict=True),
-                    rtol=rel_tol,
-                    atol=abs_tol,
-                    equal_nan=True,
-                )
+                np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
             )
-        except (ValueError, TypeError, InvalidOperationError, ComputeError):
+        except TypeError:
             try:
-                if ignore_spaces:
-                    if str(col_1.dtype) in STRING_TYPE:
-                        col_1 = col_1.str.strip_chars()
-                    if str(col_2.dtype) in STRING_TYPE:
-                        col_2 = col_2.str.strip_chars()
-
-                if ignore_case:
-                    if str(col_1.dtype) in STRING_TYPE:
-                        col_1 = col_1.str.to_uppercase()
-                    if str(col_2.dtype) in STRING_TYPE:
-                        col_2 = col_2.str.to_uppercase()
-
-                if (
-                    str(col_1.dtype) in STRING_TYPE and str(col_2.dtype) in DATE_TYPE
-                ) or (
-                    str(col_1.dtype) in DATE_TYPE and str(col_2.dtype) in STRING_TYPE
-                ):
-                    compare = compare_string_and_date_columns(col_1, col_2)
-                else:
-                    compare = pl.Series(
-                        (col_1.eq_missing(col_2)) | (col_1.is_null() & col_2.is_null())
+                compare = pl.Series(
+                    np.isclose(
+                        col_1.cast(pl.Float64, strict=True),
+                        col_2.cast(pl.Float64, strict=True),
+                        rtol=rel_tol,
+                        atol=abs_tol,
+                        equal_nan=True,
                     )
+                )
             except Exception:
-                # Blanket exception should just return all False
-                compare = pl.Series(False * col_1.shape[0])
+                try:  # last check where we just cast to strings
+                    compare = pl.Series(col_1.cast(pl.String) == col_2.cast(pl.String))
+                except Exception:  # Blanket exception should just return all False
+                    compare = pl.Series(False * col_1.shape[0])
     return compare


-def compare_string_and_date_columns(
-    col_1: "pl.Series", col_2: "pl.Series"
-) -> "pl.Series":
+def compare_string_and_date_columns(col_1: pl.Series, col_2: pl.Series) -> pl.Series:
     """Compare a string column and date column, value-wise.

     This tries to
@@ -919,7 +914,7 @@ def compare_string_and_date_columns(

     try:  # datetime is inferred
         return pl.Series(
-            (str_column.str.to_datetime().eq_missing(date_column))
+            (str_column.str.to_datetime(strict=False).eq_missing(date_column))
             | (str_column.is_null() & date_column.is_null())
         )
     except Exception:
@@ -952,7 +947,7 @@ def get_merged_columns(
     return columns


-def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float:
+def calculate_max_diff(col_1: pl.Series, col_2: pl.Series) -> float:
     """Get a maximum difference between two columns.

     Parameters
@@ -977,7 +972,7 @@ def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float:

 def generate_id_within_group(
     dataframe: pl.DataFrame, join_columns: List[str]
-) -> "pl.Series":
+) -> pl.Series:
     """Generate an ID column that can be used to deduplicate identical rows.

     The series generated
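
With the DATE_TYPE constant gone, two temporal polars Series (Date or Datetime) are now compared with a null-aware eq_missing, and a Date/String pair is routed to compare_string_and_date_columns, as exercised by the updated test_bad_date_columns. An illustrative sketch, assuming this commit and the import path datacompy.polars:

import polars as pl

from datacompy.polars import columns_equal

# Two Date columns: both are temporal, so the eq_missing branch handles nulls directly.
col_a = pl.Series("a", ["2017-01-01", "2017-01-02", None]).str.to_date(strict=False)
col_b = pl.Series("b", ["2017-01-01", "2017-01-03", None]).str.to_date(strict=False)
print(columns_equal(col_a, col_b).to_list())  # expected: [True, False, True]

# Date vs. raw strings: routed through compare_string_and_date_columns.
raw = pl.Series("b", ["2017-01-01", "2017-01-03", None])
print(columns_equal(col_a, raw).to_list())  # expected: [True, False, True]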

tests/test_core.py

Lines changed: 27 additions & 0 deletions
@@ -1497,3 +1497,30 @@ def test_non_full_join_counts_some_matches():
             ]
         ),
     )
+
+
+def test_string_as_numeric():
+    df1 = pd.DataFrame({"ID": [1], "REFER_NR": ["9998700990704001708177961516923014"]})
+    df2 = pd.DataFrame({"ID": [1], "REFER_NR": ["9998700990704001708177961516923015"]})
+    actual_out = datacompy.columns_equal(df1.REFER_NR, df2.REFER_NR)
+    assert not actual_out.all()
+
+
+def test_single_date_columns_equal_to_string():
+    data = """a|b|expected
+2017-01-01|2017-01-01 |True
+2017-01-02 |2017-01-02|True
+2017-10-01 |2017-10-10 |False
+2017-01-01||False
+|2017-01-01|False
+||False"""
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
+
+    try:
+        df["a"] = pd.to_datetime(df["a"], format="mixed")
+    except ValueError:
+        df["a"] = pd.to_datetime(df["a"])
+
+    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
+    expect_out = df["expected"]
+    assert_series_equal(expect_out, actual_out, check_names=False)

tests/test_polars.py

Lines changed: 63 additions & 2 deletions
@@ -259,8 +259,13 @@ def test_bad_date_columns():
     df = pl.DataFrame(
         [{"a": "2017-01-01", "b": "2017-01-01"}, {"a": "2017-01-01", "b": "2A17-01-01"}]
     )
-    df = df.with_columns(df["a"].str.to_date(exact=True).alias("a_dt"))
-    assert not columns_equal(df["a_dt"], df["b"]).any()
+    col_a = df["a"].str.to_date()
+    col_b = df["b"]
+    assert columns_equal(col_a, col_b).to_list() == [True, False]
+
+    col_a = df["a"]
+    col_b = df["b"].str.to_date(strict=False)
+    assert columns_equal(col_a, col_b).to_list() == [True, False]


 def test_rounded_date_columns():
@@ -1457,3 +1462,59 @@ def test_categorical_column():
     compare = PolarsCompare(df, df, join_columns=["idx"])
     assert compare.intersect_rows["foo_match"].all()
     assert compare.intersect_rows["bar_match"].all()
+
+
+def test_string_as_numeric():
+    df1 = pl.DataFrame({"ID": [1], "REFER_NR": ["9998700990704001708177961516923014"]})
+    df2 = pl.DataFrame({"ID": [1], "REFER_NR": ["9998700990704001708177961516923015"]})
+    actual_out = columns_equal(df1["REFER_NR"], df2["REFER_NR"])
+    assert not actual_out.all()
+
+
+def test_single_date_columns_equal_to_string():
+    data = """a|b|expected
+2017-01-01|2017-01-01 |True
+2017-01-02 |2017-01-02|True
+2017-10-01 |2017-10-10 |False
+2017-01-01||False
+|2017-01-01|False
+||True"""
+    df = pl.read_csv(
+        io.StringIO(data),
+        separator="|",
+        null_values=["NULL"],
+        missing_utf8_is_empty_string=True,
+    )
+    col_a = df["a"].str.strip_chars().str.to_date(strict=False)
+    col_b = df["b"]
+
+    actual_out = columns_equal(col_a, col_b, rel_tol=0.2, ignore_spaces=True)
+    expect_out = df["expected"]
+    assert_series_equal(expect_out, actual_out, check_names=False)
+
+
+def test_temporal_equal():
+    data = """a|b|expected
+2017-01-01|2017-01-01|True
+2017-01-02|2017-01-02|True
+2017-10-01|2017-10-10 |False
+2017-01-01||False
+|2017-01-01|False
+||True"""
+    df = pl.read_csv(
+        io.StringIO(data),
+        separator="|",
+        null_values=["NULL"],
+        missing_utf8_is_empty_string=True,
+    )
+    expect_out = df["expected"]
+
+    col_a = df["a"].str.to_date(strict=False)
+    col_b = df["b"].str.to_date(strict=False)
+    actual_out = columns_equal(col_a, col_b)
+    assert_series_equal(expect_out, actual_out, check_names=False)
+
+    col_a = df["a"].str.to_datetime(strict=False)
+    col_b = df["b"].str.to_datetime(strict=False)
+    actual_out = columns_equal(col_a, col_b)
+    assert_series_equal(expect_out, actual_out, check_names=False)
