Commit fcde4f9

treat non-positive arrowMaxRecordsPerBatch as unlimited

1 parent 64dd204 commit fcde4f9

File tree: 4 files changed, +154 -26 lines changed

  python/pyspark/sql/pandas/serializers.py
  python/pyspark/sql/tests/pandas/streaming/test_pandas_transform_with_state.py
  sql/core/src/main/scala/org/apache/spark/sql/execution/python/streaming/BaseStreamingArrowWriter.scala
  sql/core/src/test/scala/org/apache/spark/sql/execution/python/streaming/BaseStreamingArrowWriterSuite.scala
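The gist of the change: a non-positive spark.sql.execution.arrow.maxRecordsPerBatch now means "no record-count limit". On the Python side this is done by normalizing the configured value to 2**31 - 1. A minimal standalone sketch of that normalization; the helper name and asserts are illustrative only, the real change inlines the conditional in the serializer constructors shown below:

    # Hypothetical helper mirroring the inlined normalization below; sketch only.
    _UNLIMITED = 2**31 - 1  # sentinel meaning "no record-count limit"

    def normalize_max_records_per_batch(arrow_max_records_per_batch: int) -> int:
        # Any value <= 0 is treated as unlimited rather than as an immediate flush.
        return arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else _UNLIMITED

    assert normalize_max_records_per_batch(10000) == 10000
    assert normalize_max_records_per_batch(0) == _UNLIMITED
    assert normalize_max_records_per_batch(-1) == _UNLIMITED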

python/pyspark/sql/pandas/serializers.py

Lines changed: 22 additions & 25 deletions
@@ -1466,7 +1466,7 @@ def __init__(
         self.result_state_pdf_arrow_type = to_arrow_type(
             self.result_state_df_type, prefers_large_types=prefers_large_var_types
         )
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch
+        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1

     def load_stream(self, stream):
         """

@@ -1821,13 +1821,30 @@ def __init__(
             int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
             arrow_cast=True,
         )
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch
+        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
         self.arrow_max_bytes_per_batch = arrow_max_bytes_per_batch
         self.key_offsets = None
         self.average_arrow_row_size = 0
         self.total_bytes = 0
         self.total_rows = 0

+    def _update_batch_size_stats(self, batch):
+        """
+        Update batch size statistics for adaptive batching.
+        """
+        # Short circuit batch size calculation if the batch size is
+        # unlimited as computing batch size is computationally expensive.
+        if self.arrow_max_bytes_per_batch != 2**31 - 1 and batch.num_rows > 0:
+            batch_bytes = sum(
+                buf.size
+                for col in batch.columns
+                for buf in col.buffers()
+                if buf is not None
+            )
+            self.total_bytes += batch_bytes
+            self.total_rows += batch.num_rows
+            self.average_arrow_row_size = self.total_bytes / self.total_rows
+
     def load_stream(self, stream):
         """
         Read ArrowRecordBatches from stream, deserialize them to populate a list of data chunk, and

@@ -1855,18 +1872,7 @@ def generate_data_batches(batches):

             def row_stream():
                 for batch in batches:
-                    # Short circuit batch size calculation if the batch size is
-                    # unlimited as computing batch size is computationally expensive.
-                    if self.arrow_max_bytes_per_batch != 2**31 - 1 and batch.num_rows > 0:
-                        batch_bytes = sum(
-                            buf.size
-                            for col in batch.columns
-                            for buf in col.buffers()
-                            if buf is not None
-                        )
-                        self.total_bytes += batch_bytes
-                        self.total_rows += batch.num_rows
-                        self.average_arrow_row_size = self.total_bytes / self.total_rows
+                    self._update_batch_size_stats(batch)
                     data_pandas = [
                         self.arrow_to_pandas(c, i)
                         for i, c in enumerate(pa.Table.from_batches([batch]).itercolumns())

@@ -1993,16 +1999,7 @@ def flatten_columns(cur_batch, col_name):
             """
             def row_stream():
                 for batch in batches:
-                    if self.arrow_max_bytes_per_batch != 2**31 - 1 and batch.num_rows > 0:
-                        batch_bytes = sum(
-                            buf.size
-                            for col in batch.columns
-                            for buf in col.buffers()
-                            if buf is not None
-                        )
-                        self.total_bytes += batch_bytes
-                        self.total_rows += batch.num_rows
-                        self.average_arrow_row_size = self.total_bytes / self.total_rows
+                    self._update_batch_size_stats(batch)

                     flatten_state_table = flatten_columns(batch, "inputData")
                     data_pandas = [

@@ -2078,7 +2075,7 @@ class TransformWithStateInPySparkRowSerializer(ArrowStreamUDFSerializer):

     def __init__(self, arrow_max_records_per_batch):
         super(TransformWithStateInPySparkRowSerializer, self).__init__()
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch
+        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
         self.key_offsets = None

     def load_stream(self, stream):
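For context on what the new _update_batch_size_stats helper measures: an Arrow record batch's size in bytes is obtained by summing the sizes of its columns' underlying buffers. A small self-contained sketch of the same buffer walk against a throwaway pyarrow batch; the sample data is illustrative only, the serializer receives batches deserialized from the stream:

    import pyarrow as pa

    # Illustrative batch built in place of a deserialized streaming batch.
    batch = pa.RecordBatch.from_pydict({"id": ["0", "1"], "value": [789, 987]})

    # Same computation as _update_batch_size_stats: sum every non-None buffer size.
    batch_bytes = sum(
        buf.size
        for col in batch.columns
        for buf in col.buffers()
        if buf is not None
    )
    average_row_size = batch_bytes / batch.num_rows
    print(batch_bytes, average_row_size)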

python/pyspark/sql/tests/pandas/streaming/test_pandas_transform_with_state.py

Lines changed: 92 additions & 0 deletions
@@ -1483,6 +1483,98 @@ def check_results(batch_df, batch_id):
             ),
         )

+    def test_transform_with_state_with_records_limit(self):
+        if not self.use_pandas():
+            return
+
+        def make_check_results(expected_per_batch):
+            def check_results(batch_df, batch_id):
+                batch_df.collect()
+                if batch_id == 0:
+                    assert set(batch_df.sort("id").collect()) == expected_per_batch[0]
+                else:
+                    assert set(batch_df.sort("id").collect()) == expected_per_batch[1]
+
+            return check_results
+
+        result_with_small_limit = [
+            {
+                Row(id="0", chunkCount=2),
+                Row(id="1", chunkCount=2),
+            },
+            {
+                Row(id="0", chunkCount=3),
+                Row(id="1", chunkCount=2),
+            },
+        ]
+
+        result_with_large_limit = [
+            {
+                Row(id="0", chunkCount=1),
+                Row(id="1", chunkCount=1),
+            },
+            {
+                Row(id="0", chunkCount=1),
+                Row(id="1", chunkCount=1),
+            },
+        ]
+
+        data = [("0", 789), ("3", 987)]
+        initial_state = self.spark.createDataFrame(data, "id string, initVal int").groupBy("id")
+
+        with self.sql_conf(
+            # Set it to a very small number so that every row would be a separate pandas df
+            {"spark.sql.execution.arrow.maxRecordsPerBatch": "1"}
+        ):
+            self._test_transform_with_state_basic(
+                ChunkCountProcessorFactory(),
+                make_check_results(result_with_small_limit),
+                output_schema=StructType(
+                    [
+                        StructField("id", StringType(), True),
+                        StructField("chunkCount", IntegerType(), True),
+                    ]
+                ),
+            )
+
+            self._test_transform_with_state_basic(
+                ChunkCountProcessorWithInitialStateFactory(),
+                make_check_results(result_with_small_limit),
+                initial_state=initial_state,
+                output_schema=StructType(
+                    [
+                        StructField("id", StringType(), True),
+                        StructField("chunkCount", IntegerType(), True),
+                    ]
+                ),
+            )
+
+        with self.sql_conf(
+            {"spark.sql.execution.arrow.maxRecordsPerBatch": "-1"}
+        ):
+            self._test_transform_with_state_basic(
+                ChunkCountProcessorFactory(),
+                make_check_results(result_with_large_limit),
+                output_schema=StructType(
+                    [
+                        StructField("id", StringType(), True),
+                        StructField("chunkCount", IntegerType(), True),
+                    ]
+                ),
+            )
+
+            self._test_transform_with_state_basic(
+                ChunkCountProcessorWithInitialStateFactory(),
+                make_check_results(result_with_large_limit),
+                initial_state=initial_state,
+                output_schema=StructType(
+                    [
+                        StructField("id", StringType(), True),
+                        StructField("chunkCount", IntegerType(), True),
+                    ]
+                ),
+            )
+
     # test all state types (value, list, map) with large values (512 KB)
     def test_transform_with_state_large_values(self):
         def check_results(batch_df, batch_id):
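The test drives both behaviors purely through spark.sql.execution.arrow.maxRecordsPerBatch: "1" forces a flush after every record, while "-1" is now treated as unlimited. Outside the test harness the same toggle can be set on a session; a minimal sketch assuming an existing SparkSession:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Non-positive values are now treated as "no record-count limit"
    # by the streaming Arrow writer.
    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "-1")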

sql/core/src/main/scala/org/apache/spark/sql/execution/python/streaming/BaseStreamingArrowWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ class BaseStreamingArrowWriter(

   protected def isBatchSizeLimitReached: Boolean = {
     // If we have either reached the records or bytes limit
-    totalNumRowsForBatch >= arrowMaxRecordsPerBatch ||
+    (arrowMaxRecordsPerBatch > 0 && totalNumRowsForBatch >= arrowMaxRecordsPerBatch) ||
       // Short circuit batch size calculation if the batch size is unlimited as computing batch
       // size is computationally expensive.
       ((arrowMaxBytesPerBatch != Int.MaxValue)
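On the Scala side, rather than rewriting the configured value, the guard simply makes the record-count check unreachable when the limit is non-positive, leaving the (separately short-circuited) bytes check as the only flush trigger. A rough Python rendering of just the record-count half of the predicate, with the bytes condition abstracted away; names are illustrative:

    def records_limit_reached(total_rows_for_batch: int, arrow_max_records_per_batch: int) -> bool:
        # Mirrors the new guard: a non-positive limit can never be reached.
        return arrow_max_records_per_batch > 0 and total_rows_for_batch >= arrow_max_records_per_batch

    assert records_limit_reached(10, 10)
    assert not records_limit_reached(1_000_000, 0)   # unlimited
    assert not records_limit_reached(1_000_000, -1)  # unlimited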

sql/core/src/test/scala/org/apache/spark/sql/execution/python/streaming/BaseStreamingArrowWriterSuite.scala

Lines changed: 39 additions & 0 deletions
@@ -95,4 +95,43 @@ class BaseStreamingArrowWriterSuite extends SparkFunSuite with BeforeAndAfterEach
     verify(writer, times(2)).writeBatch()
     verify(arrowWriter, times(2)).reset()
   }
+
+  test("test negative or zero arrowMaxRecordsPerBatch is unlimited") {
+    val root: VectorSchemaRoot = mock(classOf[VectorSchemaRoot])
+    val dataRow = mock(classOf[InternalRow])
+
+    // Test with negative value
+    transformWithStateInPySparkWriter = new BaseStreamingArrowWriter(
+      root, writer, -1, arrowMaxBytesPerBatch, arrowWriter)
+
+    // Write many rows (more than typical batch size)
+    for (_ <- 1 to 10) {
+      transformWithStateInPySparkWriter.writeRow(dataRow)
+    }
+
+    // Verify all rows were written but batch was not finalized
+    verify(arrowWriter, times(10)).write(dataRow)
+    verify(writer, never()).writeBatch()
+
+    // Only finalize when explicitly called
+    transformWithStateInPySparkWriter.finalizeCurrentArrowBatch()
+    verify(writer).writeBatch()
+
+    // Test with zero value
+    transformWithStateInPySparkWriter = new BaseStreamingArrowWriter(
+      root, writer, 0, arrowMaxBytesPerBatch, arrowWriter)
+
+    // Write many rows again
+    for (_ <- 1 to 10) {
+      transformWithStateInPySparkWriter.writeRow(dataRow)
+    }
+
+    // Verify rows were written but batch was not finalized
+    verify(arrowWriter, times(20)).write(dataRow)
+    verify(writer).writeBatch() // still 1 from before
+
+    // Only finalize when explicitly called
+    transformWithStateInPySparkWriter.finalizeCurrentArrowBatch()
+    verify(writer, times(2)).writeBatch()
+  }
 }
