[SPARK-54472][SQL] Add ORC read and write support for TIME type

vinodkc · dongjoon-hyun · commit 4d1c79fe8f13 · 2025-11-23T14:11:31.000-08:00
### What changes were proposed in this pull request?

This PR adds ORC serialization and deserialization support for Spark's TIME type.

### Why are the changes needed?

TIME type currently lacks ORC support, preventing users from:
- Reading/writing ORC files with TIME columns
- Integrating TIME data with existing ORC-based data lakes
- Preserving TIME precision in columnar storage

### Does this PR introduce _any_ user-facing change?

Yes. Users can now:

1. Read ORC files with TIME columns
```scala
spark.read.format("orc").load("data.orc")
// Returns DataFrame with TIME columns preserved
```

2. Write DataFrames with TIME to ORC
```scala
val df = spark.sql("SELECT TIME'14:30:45.123456' as shift_start")
df.write.format("orc").save("output.orc")
```

### Technical Details

#### Storage Format
```
ORC Column:
  Physical Type: LONG (nanoseconds since midnight)
  Custom Attribute: spark.sql.catalyst.type = "time(<precision>)"
  Value Range: 0 to 86,399,999,999,999
```

#### Precision Handling

| Precision | Catalyst Type | ORC Attribute | Example Value |
|-----------|---------------|---------------|---------------|
| 0 (seconds) | `TimeType(0)` | `"time(0)"` | `12:34:56` |
| 3 (millis) | `TimeType(3)` | `"time(3)"` | `12:34:56.123` |
| 6 (micros) | `TimeType(6)` | `"time(6)"` | `12:34:56.123456` |

#### Future Compatibility
- Versioned via file metadata: Uses existing `org.apache.spark.version` for compatibility
- Forward-compatible: If ORC adds native TIME type, can migrate based on version

### How was this patch tested?

Added tests in `OrcQuerySuite`

### Was this patch authored or co-authored using generative AI tooling?

Yes.

Generated-by: Claude 3.5 Sonnet

AI assistance was used for:
- Code pattern analysis and design discussions
- Implementation guidance following Spark conventions
- Test case generation and organization
- Documentation and examples

Closes #53185 from vinodkc/br_time_orc_read_write.
Authored-by: vinodkc <vinod.kc.in@gmail.com> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -126,8 +126,8 @@ class OrcDeserializer(
       case IntegerType | _: YearMonthIntervalType => (ordinal, value) =>
         updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)
 
-      case LongType | _: DayTimeIntervalType | _: TimestampNTZType => (ordinal, value) =>
-        updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
+      case LongType | _: DayTimeIntervalType | _: TimestampNTZType | _: TimeType =>
+        (ordinal, value) => updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
 
       case FloatType => (ordinal, value) =>
         updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
@@ -249,7 +249,6 @@ class OrcFileFormat
   override def supportDataType(dataType: DataType): Boolean = dataType match {
     case _: VariantType => false
 
-    case _: TimeType => false
     case _: AtomicType => true
 
     case st: StructType => st.forall { f => supportDataType(f.dataType) }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
@@ -98,7 +98,7 @@ class OrcSerializer(dataSchema: StructType) {
       }
 
 
-    case LongType | _: DayTimeIntervalType | _: TimestampNTZType =>
+    case LongType | _: DayTimeIntervalType | _: TimestampNTZType | _: TimeType =>
       if (reuseObj) {
         val result = new LongWritable()
         (getter, ordinal) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -282,7 +282,7 @@ object OrcUtils extends Logging {
       s"array<${getOrcSchemaString(a.elementType)}>"
     case m: MapType =>
       s"map<${getOrcSchemaString(m.keyType)},${getOrcSchemaString(m.valueType)}>"
-    case _: DayTimeIntervalType | _: TimestampNTZType => LongType.catalogString
+    case _: DayTimeIntervalType | _: TimestampNTZType | _: TimeType => LongType.catalogString
     case _: YearMonthIntervalType => IntegerType.catalogString
     case _ => dt.catalogString
   }
@@ -302,6 +302,10 @@ object OrcUtils extends Logging {
           val typeDesc = new TypeDescription(TypeDescription.Category.LONG)
           typeDesc.setAttribute(CATALYST_TYPE_ATTRIBUTE_NAME, n.typeName)
           Some(typeDesc)
+        case tm: TimeType =>
+          val typeDesc = new TypeDescription(TypeDescription.Category.LONG)
+          typeDesc.setAttribute(CATALYST_TYPE_ATTRIBUTE_NAME, tm.typeName)
+          Some(typeDesc)
         case t: TimestampType =>
           val typeDesc = new TypeDescription(TypeDescription.Category.TIMESTAMP)
           typeDesc.setAttribute(CATALYST_TYPE_ATTRIBUTE_NAME, t.typeName)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -1247,7 +1247,7 @@ class FileBasedDataSourceSuite extends QueryTest
   }
 
   test("SPARK-51590: unsupported the TIME data types in data sources") {
-    val datasources = Seq("orc", "text")
+    val datasources = Seq("text")
     Seq(true, false).foreach { useV1 =>
       val useV1List = if (useV1) {
         datasources.mkString(",")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala
@@ -899,6 +899,54 @@ abstract class OrcQuerySuite extends OrcQueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("TIME type support for ORC format") {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val df = spark.sql("""
+        SELECT
+          id,
+          TIME'09:30:00' as morning,
+          TIME'14:45:30.123456' as afternoon,
+          TIME'23:59:59.999999' as end_of_day,
+          TIME'00:00:00' as midnight,
+          CASE WHEN id % 2 = 0 THEN TIME'12:30:00' ELSE NULL END as nullable_time
+        FROM VALUES (1), (2), (3) AS t(id)
+      """)
+
+      df.write.mode("overwrite").orc(path)
+      val result = spark.read.orc(path)
+
+      Seq("morning", "afternoon", "end_of_day", "midnight", "nullable_time").foreach { col =>
+        assert(result.schema(col).dataType == TimeType(6))
+      }
+      checkAnswer(result, df)
+    }
+  }
+
+  test("TIME type with different precisions in ORC") {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val df = spark.sql("""
+        SELECT
+          CAST(TIME'12:34:56' AS TIME(0)) as time_p0,
+          CAST(TIME'12:34:56.1' AS TIME(1)) as time_p1,
+          CAST(TIME'12:34:56.12' AS TIME(2)) as time_p2,
+          CAST(TIME'12:34:56.123' AS TIME(3)) as time_p3,
+          CAST(TIME'12:34:56.1234' AS TIME(4)) as time_p4,
+          CAST(TIME'12:34:56.12345' AS TIME(5)) as time_p5,
+          CAST(TIME'12:34:56.123456' AS TIME(6)) as time_p6
+      """)
+
+      df.write.mode("overwrite").orc(path)
+      val result = spark.read.orc(path)
+
+      (0 to 6).foreach { p =>
+        assert(result.schema(s"time_p$p").dataType == TimeType(p))
+      }
+      checkAnswer(result, df)
+    }
+  }
 }
 
 class OrcV1QuerySuite extends OrcQuerySuite {

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ class OrcSerializer(dataSchema: StructType) {`
`98`	`98`	`}`
`99`	`99`
`100`	`100`
`101`		`- case LongType \| _: DayTimeIntervalType \| _: TimestampNTZType =>`
	`101`	`+ case LongType \| _: DayTimeIntervalType \| _: TimestampNTZType \| _: TimeType =>`
`102`	`102`	`if (reuseObj) {`
`103`	`103`	`val result = new LongWritable()`
`104`	`104`	`(getter, ordinal) =>`
Original file line number	Diff line number	Diff line change
`@@ -1247,7 +1247,7 @@ class FileBasedDataSourceSuite extends QueryTest`
`1247`	`1247`	`}`
`1248`	`1248`
`1249`	`1249`	`test("SPARK-51590: unsupported the TIME data types in data sources") {`
`1250`		`- val datasources = Seq("orc", "text")`
	`1250`	`+ val datasources = Seq("text")`
`1251`	`1251`	`Seq(true, false).foreach { useV1 =>`
`1252`	`1252`	`val useV1List = if (useV1) {`
`1253`	`1253`	`datasources.mkString(",")`