From 1315c7c2067c6c21a52b9343bcad67f3470533ae Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 17 Nov 2025 10:37:06 -0600 Subject: [PATCH 1/4] [SPARK-46166][Pyspark] Implementation of pandas.DataFrame.any with axis=1 Signed-off-by: Devin Petersohn --- python/pyspark/pandas/frame.py | 61 +++++++++++++------ .../pandas/tests/computation/test_any_all.py | 26 ++++++-- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index af89d18a0ede..c3034882dcbe 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -11132,7 +11132,7 @@ def all( # TODO(SPARK-46166): axis and **kwargs should be implemented. def any( self, axis: Axis = 0, bool_only: Optional[bool] = None, skipna: bool = True - ) -> "Series": + ) -> Union["Series", bool]: """ Return whether any element is True. @@ -11195,29 +11195,54 @@ def any( >>> df[[]].any() Series([], dtype: bool) """ - axis = validate_axis(axis) - if axis != 0: - raise NotImplementedError('axis should be either 0 or "index" currently.') - column_labels = self._internal.column_labels if bool_only: column_labels = self._bool_column_labels(column_labels) if len(column_labels) == 0: return ps.Series([], dtype=bool) + if axis == 0: + applied: List[PySparkColumn] = [] + for label in column_labels: + scol = self._internal.spark_column_for(label) + if skipna: + # When skipna=True, nulls count as False + any_col = F.max(scol.cast("boolean")) + applied.append(F.when(any_col.isNull(), False).otherwise(any_col)) + else: + # When skipna=False, nulls count as True + any_col = F.max(scol.cast("boolean")) + applied.append(F.when(any_col.isNull(), True).otherwise(any_col)) + return self._result_aggregated(column_labels, applied) + elif axis == 1: + from pyspark.pandas.series import first_series - applied: List[PySparkColumn] = [] - for label in column_labels: - scol = self._internal.spark_column_for(label) - if skipna: - # When skipna=True, nulls count as False - any_col = F.max(scol.cast("boolean")) - applied.append(F.when(any_col.isNull(), False).otherwise(any_col)) - else: - # When skipna=False, nulls count as True - any_col = F.max(scol.cast("boolean")) - applied.append(F.when(any_col.isNull(), True).otherwise(any_col)) - - return self._result_aggregated(column_labels, applied) + sdf = self._internal.spark_frame.select( + *self._internal_frame.index_spark_columns, + F.greatest( + *[ + F.coalesce( + self._internal.spark_column_for(label).cast("boolean"), + # When skipna=True, nulls count as False and vice versa + F.lit(not skipna), + ) for label in column_labels + ], + F.lit(False), # Handle one-column DataFrame case + ).alias(SPARK_DEFAULT_SERIES_NAME) + ) + return first_series( + DataFrame( + InternalFrame( + spark_frame=sdf, + index_spark_columns=self._internal.index_spark_columns, + index_names=self._internal.index_names, + index_fields=self._internal.index_fields, + column_labels=[None], + ) + ) + ) + else: + # axis=None case - return single boolean value + return self.any(axis=1).any() def _bool_column_labels(self, column_labels: List[Label]) -> List[Label]: """ diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py index c381c96ead0e..ad3122a84f07 100644 --- a/python/pyspark/pandas/tests/computation/test_any_all.py +++ b/python/pyspark/pandas/tests/computation/test_any_all.py @@ -135,6 +135,11 @@ def test_any(self): self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True)) self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False)) + # Test axis=1 + self.assert_eq(psdf.any(axis=1), pdf.any(axis=1)) + self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True)) + self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False)) + columns.names = ["X", "Y"] pdf.columns = columns psdf.columns = columns @@ -143,10 +148,11 @@ def test_any(self): self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True)) self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False)) - with self.assertRaisesRegex( - NotImplementedError, 'axis should be either 0 or "index" currently.' - ): - psdf.any(axis=1) + # Test axis=1 + self.assert_eq(psdf.any(axis=1), pdf.any(axis=1)) + self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True)) + self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False)) + # Test skipna parameter pdf = pd.DataFrame( @@ -156,12 +162,14 @@ def test_any(self): # bools and np.nan self.assert_eq(psdf[["A", "B"]].any(skipna=False), pdf[["A", "B"]].any(skipna=False)) + self.assert_eq(psdf[["A", "B"]].any(axis=1, skipna=False), pdf[["A", "B"]].any(axis=1, skipna=False)) # bools and None self.assert_eq(psdf[["A", "C"]].any(skipna=False), pdf[["A", "C"]].any(skipna=False)) # bools, np.nan, and None self.assert_eq(psdf[["B", "C"]].any(skipna=False), pdf[["B", "C"]].any(skipna=False)) # np.nan, and None self.assert_eq(psdf[["D"]].any(skipna=False), pdf[["D"]].any(skipna=False)) + self.assert_eq(psdf[["D"]].any(axis=1, skipna=False), pdf[["D"]].any(axis=1, skipna=False)) # np.nan only self.assert_eq( @@ -169,6 +177,11 @@ def test_any(self): pd.DataFrame([np.nan]).any(skipna=False), almost=True, ) + self.assert_eq( + ps.DataFrame([np.nan]).any(axis=1, skipna=False), + pd.DataFrame([np.nan]).any(axis=1, skipna=False), + almost=True, + ) # None only self.assert_eq( @@ -176,6 +189,11 @@ def test_any(self): pd.DataFrame([None]).any(skipna=True), almost=True, ) + self.assert_eq( + ps.DataFrame([None]).any(axis=1, skipna=True), + pd.DataFrame([None]).any(axis=1, skipna=True), + almost=True, + ) class FrameAnyAllTests( From d5269a4713a79e354b0f9d716bb1faf369b8e73e Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 17 Nov 2025 12:42:14 -0600 Subject: [PATCH 2/4] lint Signed-off-by: Devin Petersohn --- python/pyspark/pandas/frame.py | 5 +++-- python/pyspark/pandas/tests/computation/test_any_all.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index c3034882dcbe..348690b79c0f 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -11224,10 +11224,11 @@ def any( self._internal.spark_column_for(label).cast("boolean"), # When skipna=True, nulls count as False and vice versa F.lit(not skipna), - ) for label in column_labels + ) + for label in column_labels ], F.lit(False), # Handle one-column DataFrame case - ).alias(SPARK_DEFAULT_SERIES_NAME) + ).alias(SPARK_DEFAULT_SERIES_NAME), ) return first_series( DataFrame( diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py index ad3122a84f07..87d09064b54b 100644 --- a/python/pyspark/pandas/tests/computation/test_any_all.py +++ b/python/pyspark/pandas/tests/computation/test_any_all.py @@ -153,7 +153,6 @@ def test_any(self): self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True)) self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False)) - # Test skipna parameter pdf = pd.DataFrame( {"A": [True, False], "B": [1, np.nan], "C": [True, None], "D": [None, np.nan]} @@ -162,7 +161,9 @@ def test_any(self): # bools and np.nan self.assert_eq(psdf[["A", "B"]].any(skipna=False), pdf[["A", "B"]].any(skipna=False)) - self.assert_eq(psdf[["A", "B"]].any(axis=1, skipna=False), pdf[["A", "B"]].any(axis=1, skipna=False)) + self.assert_eq( + psdf[["A", "B"]].any(axis=1, skipna=False), pdf[["A", "B"]].any(axis=1, skipna=False) + ) # bools and None self.assert_eq(psdf[["A", "C"]].any(skipna=False), pdf[["A", "C"]].any(skipna=False)) # bools, np.nan, and None From a2a50437bdd477138a5c3360f9146ec86fe90dd9 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 17 Nov 2025 14:40:24 -0600 Subject: [PATCH 3/4] Don't do axis=None in this PR Signed-off-by: Devin Petersohn --- python/pyspark/pandas/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 348690b79c0f..3a8369efec86 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -11132,7 +11132,7 @@ def all( # TODO(SPARK-46166): axis and **kwargs should be implemented. def any( self, axis: Axis = 0, bool_only: Optional[bool] = None, skipna: bool = True - ) -> Union["Series", bool]: + ) -> "Series": """ Return whether any element is True. @@ -11243,7 +11243,7 @@ def any( ) else: # axis=None case - return single boolean value - return self.any(axis=1).any() + raise NotImplementedError('axis should be 0, 1, "index", or "columns" currently.') def _bool_column_labels(self, column_labels: List[Label]) -> List[Label]: """ From ea1c87faf8973d10074bbd56f1ffdd2c55cd5fb1 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 18 Nov 2025 07:21:27 -0600 Subject: [PATCH 4/4] Address comments Signed-off-by: Devin Petersohn --- python/pyspark/pandas/frame.py | 1 + .../pandas/tests/computation/test_any_all.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 3a8369efec86..379d3698bc09 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -11195,6 +11195,7 @@ def any( >>> df[[]].any() Series([], dtype: bool) """ + axis = validate_axis(axis) column_labels = self._internal.column_labels if bool_only: column_labels = self._bool_column_labels(column_labels) diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py index 87d09064b54b..37966f9e0bf1 100644 --- a/python/pyspark/pandas/tests/computation/test_any_all.py +++ b/python/pyspark/pandas/tests/computation/test_any_all.py @@ -140,6 +140,24 @@ def test_any(self): self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True)) self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False)) + # Test axis='index' + self.assert_eq(psdf.any(axis="index"), pdf.any(axis="index")) + self.assert_eq( + psdf.any(axis="index", bool_only=True), pdf.any(axis="index", bool_only=True) + ) + self.assert_eq( + psdf.any(axis="index", bool_only=False), pdf.any(axis="index", bool_only=False) + ) + + # Test axis='columns' + self.assert_eq(psdf.any(axis="columns"), pdf.any(axis="columns")) + self.assert_eq( + psdf.any(axis="columns", bool_only=True), pdf.any(axis="columns", bool_only=True) + ) + self.assert_eq( + psdf.any(axis="columns", bool_only=False), pdf.any(axis="columns", bool_only=False) + ) + columns.names = ["X", "Y"] pdf.columns = columns psdf.columns = columns