
Commit 621c23d

format
1 parent fcde4f9 commit 621c23d

3 files changed: +26 −19 lines

python/pyspark/sql/pandas/serializers.py

Lines changed: 25 additions & 15 deletions
@@ -1466,7 +1466,9 @@ def __init__(
         self.result_state_pdf_arrow_type = to_arrow_type(
             self.result_state_df_type, prefers_large_types=prefers_large_var_types
         )
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        self.arrow_max_records_per_batch = (
+            arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        )
 
     def load_stream(self, stream):
         """
@@ -1821,7 +1823,9 @@ def __init__(
             int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
             arrow_cast=True,
         )
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        self.arrow_max_records_per_batch = (
+            arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        )
         self.arrow_max_bytes_per_batch = arrow_max_bytes_per_batch
         self.key_offsets = None
         self.average_arrow_row_size = 0
@@ -1836,10 +1840,7 @@ def _update_batch_size_stats(self, batch):
         # unlimited as computing batch size is computationally expensive.
         if self.arrow_max_bytes_per_batch != 2**31 - 1 and batch.num_rows > 0:
             batch_bytes = sum(
-                buf.size
-                for col in batch.columns
-                for buf in col.buffers()
-                if buf is not None
+                buf.size for col in batch.columns for buf in col.buffers() if buf is not None
             )
             self.total_bytes += batch_bytes
             self.total_rows += batch.num_rows
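For context, a minimal sketch (not part of this commit, assuming pyarrow is installed) of what the reflowed comprehension computes: it sums the sizes of all non-null Arrow buffers backing the batch's columns.

import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"id": [1, 2, 3], "name": ["a", None, "c"]})
# Sum the sizes of every non-null buffer behind each column, as in
# _update_batch_size_stats above.
batch_bytes = sum(
    buf.size for col in batch.columns for buf in col.buffers() if buf is not None
)
print(batch_bytes, batch.num_rows)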
@@ -1997,6 +1998,7 @@ def flatten_columns(cur_batch, col_name):
         data generator. Rows in the same batch may have different grouping keys,
         but each batch will have either init_data or input_data, not mix.
         """
+
         def row_stream():
             for batch in batches:
                 self._update_batch_size_stats(batch)
@@ -2034,21 +2036,25 @@ def row_stream():
 
                 total_len = len(rows) + len(init_state_rows)
                 if (
-                    total_len >= self.arrow_max_records_per_batch
-                    or total_len * self.average_arrow_row_size >= self.arrow_max_bytes_per_batch
+                    total_len >= self.arrow_max_records_per_batch
+                    or total_len * self.average_arrow_row_size >= self.arrow_max_bytes_per_batch
                 ):
                     yield (
                         batch_key,
                         pd.DataFrame(rows) if len(rows) > 0 else EMPTY_DATAFRAME.copy(),
-                        pd.DataFrame(init_state_rows) if len(init_state_rows) > 0 else EMPTY_DATAFRAME.copy()
+                        pd.DataFrame(init_state_rows)
+                        if len(init_state_rows) > 0
+                        else EMPTY_DATAFRAME.copy(),
                     )
                     rows = []
                     init_state_rows = []
             if rows or init_state_rows:
                 yield (
                     batch_key,
                     pd.DataFrame(rows) if len(rows) > 0 else EMPTY_DATAFRAME.copy(),
-                    pd.DataFrame(init_state_rows) if len(init_state_rows) > 0 else EMPTY_DATAFRAME.copy()
+                    pd.DataFrame(init_state_rows)
+                    if len(init_state_rows) > 0
+                    else EMPTY_DATAFRAME.copy(),
                 )
 
         _batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
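The reformatted condition above gates when a buffered group is flushed: either the record count or the estimated byte size (buffered rows times the running average Arrow row size) reaches its limit. A hypothetical restatement of that predicate, not part of the commit:

def should_flush(total_len, average_arrow_row_size, max_records, max_bytes):
    # Flush once either the record limit or the estimated byte limit is reached.
    return (
        total_len >= max_records
        or total_len * average_arrow_row_size >= max_bytes
    )

assert should_flush(10, 100, 10, 10_000)        # record limit reached
assert should_flush(5, 3_000, 100, 10_000)      # estimated bytes exceed limit
assert not should_flush(5, 100, 100, 10_000)    # neither limit reached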
@@ -2075,7 +2081,9 @@ class TransformWithStateInPySparkRowSerializer(ArrowStreamUDFSerializer):
 
     def __init__(self, arrow_max_records_per_batch):
         super(TransformWithStateInPySparkRowSerializer, self).__init__()
-        self.arrow_max_records_per_batch = arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        self.arrow_max_records_per_batch = (
+            arrow_max_records_per_batch if arrow_max_records_per_batch > 0 else 2**31 - 1
+        )
         self.key_offsets = None
 
     def load_stream(self, stream):
@@ -2184,7 +2192,9 @@ def generate_data_batches(batches) -> Iterator[Tuple[Any, Optional[Any], Optiona
             into the data generator.
             """
 
-            def extract_rows(cur_batch, col_name, key_offsets) -> Optional[Iterator[Tuple[Any, Any]]]:
+            def extract_rows(
+                cur_batch, col_name, key_offsets
+            ) -> Optional[Iterator[Tuple[Any, Any]]]:
                 data_column = cur_batch.column(cur_batch.schema.get_field_index(col_name))
 
                 # Check if the entire column is null
@@ -2242,20 +2252,20 @@ def row_iterator():
             for k, g in groupby(data_batches, key=lambda x: x[0]):
                 input_rows = []
                 init_rows = []
-
+
                 for batch_key, input_row, init_row in g:
                     if input_row is not None:
                         input_rows.append(input_row)
                     if init_row is not None:
                         init_rows.append(init_row)
-
+
                 total_len = len(input_rows) + len(init_rows)
                 if total_len >= self.arrow_max_records_per_batch:
                     ret_tuple = (iter(input_rows), iter(init_rows))
                     yield (TransformWithStateInPandasFuncMode.PROCESS_DATA, k, ret_tuple)
                     input_rows = []
                     init_rows = []
-
+
                 if input_rows or init_rows:
                     ret_tuple = (iter(input_rows), iter(init_rows))
                     yield (TransformWithStateInPandasFuncMode.PROCESS_DATA, k, ret_tuple)

python/pyspark/sql/tests/pandas/streaming/test_pandas_transform_with_state.py

Lines changed: 1 addition & 3 deletions
@@ -1549,9 +1549,7 @@ def check_results(batch_df, batch_id):
             ),
         )
 
-        with self.sql_conf(
-            {"spark.sql.execution.arrow.maxRecordsPerBatch": "-1"}
-        ):
+        with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": "-1"}):
            self._test_transform_with_state_basic(
                ChunkCountProcessorFactory(),
                make_check_results(result_with_large_limit),
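Note: setting spark.sql.execution.arrow.maxRecordsPerBatch to "-1" exercises the non-positive path handled in the serializers above; presumably that value feeds the arrow_max_records_per_batch parameter, which then falls back to the 2**31 - 1 sentinel, i.e. effectively no per-batch record limit.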

python/pyspark/worker.py

Lines changed: 0 additions & 1 deletion
@@ -3070,7 +3070,6 @@ def values_gen():
         stateful_processor_api_client = StatefulProcessorApiClient(state_server_port, key_schema)
 
         def mapper(a):
-            import pandas as pd
             mode = a[0]
 
             if mode == TransformWithStateInPandasFuncMode.PROCESS_DATA:
