[SPARK-54307][SS] Throw an error if streaming query is restarted with stateful op but there is empty state dir

liviazhu · anishshri-db · commit 88671ca265ce · 2025-11-11T19:32:21.000-08:00
### What changes were proposed in this pull request? Add an error that is thrown when a streaming query is restarted with stateful operators in the query plan but the state directory is empty or missing. ### Why are the changes needed? Without this explicit error, users see CANNOT_LOAD_STATE_STORE.CANNOT_READ_STREAMING_STATE_FILE, which is confusing and could be mistaken for an internal error. ### Does this PR introduce _any_ user-facing change? Yes, the error condition changes from `CANNOT_LOAD_STATE_STORE.CANNOT_READ_STREAMING_STATE_FILE` to `STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY`. ### How was this patch tested? New unit tests. ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude 4.5 Closes #53007 from liviazhu/liviazhu-db/empty-state-dir-error. Authored-by: Livia Zhu <livia.zhu@databricks.com> Signed-off-by: Anish Shrigondekar <anish.shrigondekar@databricks.com>
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -5771,6 +5771,15 @@
     },
     "sqlState" : "0A000"
   },
+  "STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY" : {
+    "message" : [
+      "Cannot restart streaming query with stateful operators because the state directory is empty or missing.",
+      "Stateful operators in current batch: [<OpsInCurBatchSeq>].",
+      "This typically occurs when state files have been deleted or the streaming query was previously run without stateful operators but restarted with stateful operators.",
+      "Please remove the stateful operators, use a new checkpoint location, or restore the missing state files."
+    ],
+    "sqlState" : "42K03"
+  },
   "STREAMING_STATEFUL_OPERATOR_NOT_MATCH_IN_STATE_METADATA" : {
     "message" : [
       "Streaming stateful operator name does not match with the operator in state metadata. This likely to happen when user adds/removes/changes stateful operator of existing streaming query.",
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingErrors.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingErrors.scala
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.execution.streaming
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, SparkRuntimeException}
 
 /**
  * Object for grouping error messages from streaming query exceptions
@@ -39,4 +39,17 @@ object StreamingErrors {
       cause = err
     )
   }
+
+  /**
+   * Builds the error thrown when a restarted streaming query has stateful operators in its
+   * plan but the state directory is empty or missing.
+   *
+   * @param opsInCurBatch map of (operatorId -> operatorName) for the stateful operators in the
+   *                      current batch; rendered into the `OpsInCurBatchSeq` message parameter
+   * @return a [[SparkRuntimeException]] with error class
+   *         STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY
+   */
+  def statefulOperatorMissingStateDirectory(
+      opsInCurBatch: Map[Long, String]): Throwable = {
+    def formatPairString(pair: (Long, String)): String =
+      s"(OperatorId: ${pair._1} -> OperatorName: ${pair._2})"
+
+    // Map iteration order is unspecified, so sort by operator id to keep the rendered
+    // message stable across runs and easy to read.
+    val formattedOps = opsInCurBatch.toSeq.sortBy(_._1).map(formatPairString).mkString(", ")
+
+    new SparkRuntimeException(
+      errorClass = "STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY",
+      messageParameters = Map("OpsInCurBatchSeq" -> formattedOps)
+    )
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/IncrementalExecution.scala
@@ -38,7 +38,7 @@ import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, MergingSessi
 import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike
 import org.apache.spark.sql.execution.python.streaming.{FlatMapGroupsInPandasWithStateExec, TransformWithStateInPySparkExec}
-import org.apache.spark.sql.execution.streaming.StreamingQueryPlanTraverseHelper
+import org.apache.spark.sql.execution.streaming.{StreamingErrors, StreamingQueryPlanTraverseHelper}
 import org.apache.spark.sql.execution.streaming.checkpointing.{CheckpointFileManager, OffsetSeqMetadata}
 import org.apache.spark.sql.execution.streaming.operators.stateful.{SessionWindowStateStoreRestoreExec, SessionWindowStateStoreSaveExec, StatefulOperator, StatefulOperatorStateInfo, StateStoreRestoreExec, StateStoreSaveExec, StateStoreWriter, StreamingDeduplicateExec, StreamingDeduplicateWithinWatermarkExec, StreamingGlobalLimitExec, StreamingLocalLimitExec, UpdateEventTimeColumnExec}
 import org.apache.spark.sql.execution.streaming.operators.stateful.flatmapgroupswithstate.FlatMapGroupsWithStateExec
@@ -563,6 +563,18 @@ class IncrementalExecution(
           stateStoreWriter.getStateInfo.operatorId -> stateStoreWriter.shortName
       }.toMap
 
+      // Fail fast with a dedicated error when the plan has stateful operators but the state
+      // directory is missing or empty.
+      if (opMapInPhysicalPlan.nonEmpty) {
+        val stateDirPath = new Path(new Path(checkpointLocation).getParent, "state")
+        val fileManager = CheckpointFileManager.create(stateDirPath, hadoopConf)
+
+        val stateDirectoryEmpty = !fileManager.exists(stateDirPath) ||
+          fileManager.list(stateDirPath).isEmpty
+        if (stateDirectoryEmpty) {
+          throw StreamingErrors.statefulOperatorMissingStateDirectory(opMapInPhysicalPlan)
+        }
+      }
+
       // A map of all (operatorId -> operatorName) in the state metadata
       val opMapInMetadata: Map[Long, String] = {
         var ret = Map.empty[Long, String]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadataSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadataSuite.scala
@@ -472,4 +472,77 @@ class OperatorStateMetadataSuite extends StreamTest with SharedSparkSession {
       }
     }
   }
+
+  test("Restart with stateful operator but empty state directory triggers error") {
+    withTempDir { checkpointDir =>
+      val checkpointPath = checkpointDir.toString
+      val source = MemoryStream[Int]
+      val sourceDf = source.toDF()
+
+      // First run: a deduplication (stateful) query that commits one batch to the checkpoint.
+      testStream(sourceDf.dropDuplicates())(
+        StartStream(checkpointLocation = checkpointPath),
+        AddData(source, 1, 2, 3),
+        ProcessAllAvailable(),
+        StopStream)
+
+      // Remove <checkpoint>/state to mimic state files being deleted between runs.
+      val statePath = new Path(checkpointPath, "state")
+      CheckpointFileManager.create(statePath, hadoopConf).delete(statePath)
+
+      // The restarted plan has a single stateful operator: id 0, the deduplication.
+      val expectedOps = "(OperatorId: 0 -> OperatorName: dedupe)"
+
+      // Second run against the same checkpoint must fail with the dedicated error.
+      testStream(sourceDf.dropDuplicates())(
+        StartStream(checkpointLocation = checkpointPath),
+        AddData(source, 4),
+        ExpectFailure[SparkRuntimeException] { failure =>
+          checkError(
+            failure.asInstanceOf[SparkRuntimeException],
+            "STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY",
+            "42K03",
+            Map("OpsInCurBatchSeq" -> expectedOps))
+        }
+      )
+    }
+  }
+
+  test("Restart with stateful operator added to previously stateless query triggers error") {
+    withTempDir { checkpointDir =>
+      val checkpointPath = checkpointDir.toString
+      val source = MemoryStream[Int]
+
+      // First run: a purely stateless projection, committing two batches to the checkpoint.
+      testStream(source.toDF().select(($"value" * 2).as("doubled")))(
+        StartStream(checkpointLocation = checkpointPath),
+        AddData(source, 1, 2, 3),
+        ProcessAllAvailable(),
+        AddData(source, 1, 2, 3),
+        ProcessAllAvailable(),
+        StopStream)
+
+      // A stateless query is not expected to leave a state directory behind; remove it
+      // defensively so the restart below always starts without one.
+      val statePath = new Path(checkpointPath, "state")
+      val stateFileManager = CheckpointFileManager.create(statePath, hadoopConf)
+      if (stateFileManager.exists(statePath)) {
+        stateFileManager.delete(statePath)
+      }
+
+      // The restarted plan has a single stateful operator: id 0, the deduplication.
+      val expectedOps = "(OperatorId: 0 -> OperatorName: dedupe)"
+
+      // Second run adds a stateful operator on top of the old checkpoint and must fail.
+      testStream(source.toDF().dropDuplicates())(
+        StartStream(checkpointLocation = checkpointPath),
+        AddData(source, 4),
+        ExpectFailure[SparkRuntimeException] { failure =>
+          checkError(
+            failure.asInstanceOf[SparkRuntimeException],
+            "STREAMING_STATEFUL_OPERATOR_MISSING_STATE_DIRECTORY",
+            "42K03",
+            Map("OpsInCurBatchSeq" -> expectedOps))
+        }
+      )
+    }
+  }
 }