
Commit c474783

add test for internal column
1 parent f1807d7 commit c474783

File tree

4 files changed (+58, -22 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 11 additions & 3 deletions
@@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.streaming.checkpointing.OffsetSeqMetadata
 import org.apache.spark.sql.execution.streaming.operators.stateful.StatefulOperatorsUtils
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.StreamingSymmetricHashJoinHelper.{LeftSide, RightSide}
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
-import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{TransformWithStateOperatorProperties, TransformWithStateVariableInfo}
+import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateStoreColumnFamilySchemaUtils, StateVariableType, TransformWithStateOperatorProperties, TransformWithStateVariableInfo}
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.timers.TimerStateUtils
 import org.apache.spark.sql.execution.streaming.runtime.StreamingCheckpointConstants.DIR_NAME_STATE
 import org.apache.spark.sql.execution.streaming.runtime.StreamingQueryCheckpointMetadata

@@ -193,7 +193,8 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging

     val stateVars = twsOperatorProperties.stateVariables
     val stateVarInfo = stateVars.filter(stateVar => stateVar.stateName == stateVarName)
-    if (stateVarInfo.size != 1) {
+    if (stateVarInfo.size != 1 &&
+      !StateStoreColumnFamilySchemaUtils.isInternalColFamilyTestOnly(stateVarName)) {
       throw StateDataSourceErrors.invalidOptionValue(STATE_VAR_NAME,
         s"State variable $stateVarName is not defined for the transformWithState operator.")
     }

@@ -293,8 +294,15 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
       if (sourceOptions.internalOnlyReadAllColumnFamilies) {
         stateVariableInfos = operatorProperties.stateVariables
       } else {
-        val stateVarInfoList = operatorProperties.stateVariables
+        var stateVarInfoList = operatorProperties.stateVariables
           .filter(stateVar => stateVar.stateName == stateVarName)
+        if (stateVarInfoList.isEmpty &&
+            StateStoreColumnFamilySchemaUtils.isInternalColFamilyTestOnly(stateVarName)) {
+          // Pass a dummy TransformWithStateVariableInfo for this TWS internal column family during testing.
+          stateVarInfoList = List(TransformWithStateVariableInfo(
+            stateVarName, StateVariableType.ValueState, false
+          ))
+        }
         require(stateVarInfoList.size == 1, s"Failed to find unique state variable info " +
           s"for state variable $stateVarName in operator ${sourceOptions.operatorId}")
         val stateVarInfo = stateVarInfoList.head
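
With the validation relaxed this way, a test can point the state data source directly at an internal column family by passing its "$"-prefixed name as the state variable. A minimal sketch of such a read, assuming a SparkSession named spark and a checkpoint directory tempDir like the one used in the new test further below ("$ttl_listState" is the TTL index that a list state with TTL maintains):

  // Only works in a test JVM: isInternalColFamilyTestOnly gates the fallback
  // that accepts "$"-prefixed names for STATE_VAR_NAME.
  val ttlIndexDf = spark.read
    .format("statestore")
    .option(StateSourceOptions.PATH, tempDir.getAbsolutePath)
    .option(StateSourceOptions.STATE_VAR_NAME, "$ttl_listState")
    .load()
    .selectExpr("partition_id", "key", "value")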

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala

Lines changed: 4 additions & 3 deletions
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
-import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}
+import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateStoreColumnFamilySchemaUtils, StateVariableType, TransformWithStateVariableInfo}
 import org.apache.spark.sql.execution.streaming.state._
 import org.apache.spark.sql.execution.streaming.state.RecordType.{getRecordTypeAsString, RecordType}
 import org.apache.spark.sql.types.{NullType, StructField, StructType}

@@ -143,14 +143,15 @@ abstract class StatePartitionReaderBase(
       useColumnFamilies = useColFamilies, storeConf, hadoopConf.value,
       useMultipleValuesPerKey = useMultipleValuesPerKey, stateSchemaProviderOpt)

-    val isInternal = partition.sourceOptions.readRegisteredTimers
-
     if (useColFamilies) {
       val store = provider.getStore(
         partition.sourceOptions.batchId + 1,
         getEndStoreUniqueId)
       require(stateStoreColFamilySchemaOpt.isDefined)
       val stateStoreColFamilySchema = stateStoreColFamilySchemaOpt.get
+      val isInternal = partition.sourceOptions.readRegisteredTimers ||
+        StateStoreColumnFamilySchemaUtils.isInternalColFamilyTestOnly(
+          stateStoreColFamilySchema.colFamilyName)
       require(stateStoreColFamilySchema.keyStateEncoderSpec.isDefined)
       store.createColFamilyIfAbsent(
         stateStoreColFamilySchema.colFamilyName,
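
Note that isInternal can no longer be computed before the schema lookup: it now depends on the resolved column family name, not just the read options, because "$"-prefixed names are reserved for internal column families and must be registered as internal when the reader recreates them. A self-contained sketch of the combined check (not the Spark source; isTesting stands in for Utils.isTesting):

  // Per-column-family decision the reader now makes before createColFamilyIfAbsent.
  def markAsInternal(
      readRegisteredTimers: Boolean,
      colFamilyName: String,
      isTesting: Boolean): Boolean = {
    // Timer reads always target internal families; otherwise a "$"-prefixed
    // name is treated as internal only while running tests.
    readRegisteredTimers || (isTesting && colFamilyName.startsWith("$"))
  }

  markAsInternal(false, "$ttl_listState", isTesting = true)  // true
  markAsInternal(false, "listState", isTesting = true)       // false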

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/operators/stateful/transformwithstate/StateStoreColumnFamilySchemaUtils.scala

Lines changed: 11 additions & 0 deletions
@@ -99,6 +99,17 @@ object StateStoreColumnFamilySchemaUtils {
   def getStateNameFromCountIndexCFName(colFamilyName: String): String =
     getStateName(COUNT_INDEX_PREFIX, colFamilyName)

+  /**
+   * Returns true if the column family is internal (starts with "$") and we are in testing mode.
+   * This is used to allow internal column families to be read during tests.
+   *
+   * @param colFamilyName The name of the column family to check
+   * @return true if this is an internal column family and Utils.isTesting is true
+   */
+  def isInternalColFamilyTestOnly(colFamilyName: String): Boolean = {
+    org.apache.spark.util.Utils.isTesting && colFamilyName.startsWith("$")
+  }
+
   def getValueStateSchema[T](
       stateName: String,
       keyEncoder: ExpressionEncoder[Any],
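
The helper deliberately keys off only two things: the reserved "$" prefix that internal column families share (for example "$ttl_listState", "$min_listState", and "$count_listState" in the test below) and Utils.isTesting, so it cannot widen what is readable in a production job. Expected behaviour, as a sketch:

  // Inside a test JVM (Utils.isTesting == true):
  StateStoreColumnFamilySchemaUtils.isInternalColFamilyTestOnly("$ttl_listState")  // true
  StateStoreColumnFamilySchemaUtils.isInternalColFamilyTestOnly("listState")       // false
  // Outside tests, Utils.isTesting is false and the helper returns false for every name.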

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionAllColumnFamiliesReaderSuite.scala

Lines changed: 32 additions & 16 deletions
@@ -826,22 +826,38 @@ class StatePartitionAllColumnFamiliesReaderSuite extends StateDataSourceTestBase
         "listState",
         groupByKeySchema,
         listStateValueSchema)
-
-      // Validate that TTL-related column families have the expected number of entries
-      val ttlIndexRows = allBytesData.filter(_.getString(3) == "$ttl_listState")
-      val minExpiryRows = allBytesData.filter(_.getString(3) == "$min_listState")
-      val countIndexRows = allBytesData.filter(_.getString(3) == "$count_listState")
-
-      // We have 2 grouping keys (a, b), so each secondary index should have entries
-      // TTL index has one entry per unique (expirationMs, groupingKey) pair
-      // Min expiry and count indexes have one entry per grouping key
-      assert(minExpiryRows.length == 2,
-        s"Expected 2 min expiry entries (one per key), got ${minExpiryRows.length}")
-      assert(countIndexRows.length == 2,
-        s"Expected 2 count index entries (one per key), got ${countIndexRows.length}")
-      // TTL index entries depend on batching - we processed 2 batches with different timestamps
-      assert(ttlIndexRows.length >= 2,
-        s"Expected at least 2 TTL index entries, got ${ttlIndexRows.length}")
+      val dummyValueSchema = StructType(Array(StructField("__dummy__", NullType)))
+      val ttlIndexKeySchema = StructType(Array(
+        StructField("expirationMs", LongType, nullable = false),
+        StructField("elementKey", groupByKeySchema)
+      ))
+      val minExpiryValueSchema = StructType(Array(
+        StructField("minExpiry", LongType)
+      ))
+      val countValueSchema = StructType(Array(
+        StructField("count", LongType)
+      ))
+      val columnFamilyAndKeyValueSchema = Seq(
+        ("$ttl_listState", ttlIndexKeySchema, dummyValueSchema),
+        ("$min_listState", groupByKeySchema, minExpiryValueSchema),
+        ("$count_listState", groupByKeySchema, countValueSchema)
+      )
+      columnFamilyAndKeyValueSchema.foreach(pair => {
+        val normalDf = spark.read
+          .format("statestore")
+          .option(StateSourceOptions.PATH, tempDir.getAbsolutePath)
+          .option(StateSourceOptions.STATE_VAR_NAME, pair._1)
+          .load()
+          .selectExpr("partition_id", "key", "value")
+
+        compareNormalAndBytesData(
+          normalDf.collect(),
+          allBytesData,
+          pair._1,
+          pair._2,
+          pair._3)
+      }
+      )
     }
   }
