Skip to content

Commit ca9c256

Browse files
committed
[SPARK-54300][PYTHON] Optimize Py4J calls in df.toPandas
### What changes were proposed in this pull request? Optimize Py4J config calls in df.toPandas ### Why are the changes needed? In Spark Connect, we already fetch all configs in a single batch; in Spark Classic, we can apply a similar optimization so that all configs are fetched in one batch, minimizing the number of Py4J calls. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #52994 from zhengruifeng/py4j_conf_topandas. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent c21d5a4 commit ca9c256

File tree

2 files changed

+31
-9
lines changed

2 files changed

+31
-9
lines changed

python/pyspark/sql/pandas/conversion.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,36 @@ def toPandas(self) -> "PandasDataFrameLike":
7171

7272
import pandas as pd
7373

74-
jconf = self.sparkSession._jconf
74+
(
75+
sessionLocalTimeZone,
76+
arrowPySparkEnabled,
77+
arrowUseLargeVarTypes,
78+
arrowPySparkFallbackEnabled,
79+
arrowPySparkSelfDestructEnabled,
80+
pandasStructHandlingMode,
81+
) = self.sparkSession._jconf.getConfs(
82+
[
83+
"spark.sql.session.timeZone",
84+
"spark.sql.execution.arrow.pyspark.enabled",
85+
"spark.sql.execution.arrow.useLargeVarTypes",
86+
"spark.sql.execution.arrow.pyspark.fallback.enabled",
87+
"spark.sql.execution.arrow.pyspark.selfDestruct.enabled",
88+
"spark.sql.execution.pandas.structHandlingMode",
89+
]
90+
)
7591

76-
if jconf.arrowPySparkEnabled():
92+
if arrowPySparkEnabled == "true":
7793
use_arrow = True
7894
try:
7995
from pyspark.sql.pandas.types import to_arrow_schema
8096
from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
8197

8298
require_minimum_pyarrow_version()
8399
arrow_schema = to_arrow_schema(
84-
self.schema, prefers_large_types=jconf.arrowUseLargeVarTypes()
100+
self.schema, prefers_large_types=arrowUseLargeVarTypes == "true"
85101
)
86102
except Exception as e:
87-
if jconf.arrowPySparkFallbackEnabled():
103+
if arrowPySparkFallbackEnabled == "true":
88104
msg = (
89105
"toPandas attempted Arrow optimization because "
90106
"'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
@@ -112,7 +128,7 @@ def toPandas(self) -> "PandasDataFrameLike":
112128
try:
113129
import pyarrow as pa
114130

115-
self_destruct = jconf.arrowPySparkSelfDestructEnabled()
131+
self_destruct = arrowPySparkSelfDestructEnabled == "true"
116132
batches = self._collect_as_arrow(split_batches=self_destruct)
117133

118134
# Rename columns to avoid duplicated column names.
@@ -148,8 +164,8 @@ def toPandas(self) -> "PandasDataFrameLike":
148164
)
149165

150166
if len(self.columns) > 0:
151-
timezone = jconf.sessionLocalTimeZone()
152-
struct_in_pandas = jconf.pandasStructHandlingMode()
167+
timezone = sessionLocalTimeZone
168+
struct_in_pandas = pandasStructHandlingMode
153169

154170
error_on_duplicated_field_names = False
155171
if struct_in_pandas == "legacy":
@@ -200,8 +216,8 @@ def toPandas(self) -> "PandasDataFrameLike":
200216
pdf = pd.DataFrame(columns=self.columns)
201217

202218
if len(pdf.columns) > 0:
203-
timezone = jconf.sessionLocalTimeZone()
204-
struct_in_pandas = jconf.pandasStructHandlingMode()
219+
timezone = sessionLocalTimeZone
220+
struct_in_pandas = pandasStructHandlingMode
205221

206222
return pd.concat(
207223
[

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7886,6 +7886,12 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
78867886
}
78877887
}
78887888

7889+
/** Return the values of the Spark SQL configuration properties for the given keys. */
7890+
@throws[NoSuchElementException]("if key is not set")
7891+
private[spark] def getConfs(keys: util.List[String]): Array[String] = {
7892+
Array.tabulate(keys.size())(i => this.getConfString(keys.get(i)))
7893+
}
7894+
78897895
/**
78907896
* Return all the configuration properties that have been set (i.e. not the default).
78917897
* This creates a new copy of the config properties in the form of a Map.

0 commit comments

Comments
 (0)