Skip to content

Commit daa29f6

Browse files
srielaucloud-fan
authored andcommitted
[SPARK-53573][SQL] IDENTIFIER everywhere
### What changes were proposed in this pull request? We propose expanding the IDENTIFIER() clause, which turns a string into a qualified identifier, to all places identifiers can appear. The current clause is severely limited in where it can go because it accepts constant expressions, including session variables. Due to the complexity of the argument the existing clause requires tricky code to incrementally analyze its arguments and then execute sections of parser code at a later point. By contrast the generalized IDENTIFIER clause only allows string literals which can be processed in the visitor methods. Due to the rework of parameter markers and string coalescing this allows for constructs such as: ``` SELECT * FROM IDENTIFIER(:cat '.' :schema '.' :table) ``` it even allows: ``` SELECT 'hello' AS IDENTIFIER(:alias); ``` This is really all identifier() needs. We may be able to deprecate and de-support the existing too complex identifier() implementation. ### Why are the changes needed? IDENTIFIER() is a popular feature, but it can only be used in very specific, hard to reason about places. The new implementation preserved 99% of teh fucntion while expanding its use to everywhere. ### Does this PR introduce _any_ user-facing change? Yes, it's a new feature ### How was this patch tested? expanded Parameters and inentifier-clause test suites. ### Was this patch authored or co-authored using generative AI tooling? Yes, Clause Sonnet 4.5 Closes #52765 from srielau/identifier-lite. Authored-by: Serge Rielau <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 5c16a73 commit daa29f6

File tree

33 files changed

+8155
-352
lines changed

33 files changed

+8155
-352
lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2064,7 +2064,7 @@
20642064
},
20652065
"IDENTIFIER_TOO_MANY_NAME_PARTS" : {
20662066
"message" : [
2067-
"<identifier> is not a valid identifier as it has more than 2 name parts."
2067+
"<identifier> is not a valid identifier as it has more than <limit> name parts."
20682068
],
20692069
"sqlState" : "42601"
20702070
},
@@ -8650,11 +8650,6 @@
86508650
"Failed to merge incompatible schemas <left> and <right>."
86518651
]
86528652
},
8653-
"_LEGACY_ERROR_TEMP_2096" : {
8654-
"message" : [
8655-
"<ddl> is not supported temporarily."
8656-
]
8657-
},
86588653
"_LEGACY_ERROR_TEMP_2097" : {
86598654
"message" : [
86608655
"Could not execute broadcast in <timeout> secs. You can increase the timeout for broadcasts via <broadcastTimeout> or disable broadcast join by setting <autoBroadcastJoinThreshold> to -1 or remove the broadcast hint if it exists in your code."

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ options { tokenVocab = SqlBaseLexer; }
4646
* When true, parameter markers are allowed everywhere a literal is supported.
4747
*/
4848
public boolean parameter_substitution_enabled = true;
49+
50+
/**
51+
* When false (default), IDENTIFIER('literal') is resolved to an identifier at parse time (identifier-lite).
52+
* When true, only the legacy IDENTIFIER(expression) function syntax is allowed.
53+
* Controlled by spark.sql.legacy.identifierClause configuration.
54+
*/
55+
public boolean legacy_identifier_clause_only = false;
4956
}
5057

5158
compoundOrSingleStatement
@@ -92,7 +99,7 @@ sqlStateValue
9299
;
93100

94101
declareConditionStatement
95-
: DECLARE multipartIdentifier CONDITION (FOR SQLSTATE VALUE? sqlStateValue)?
102+
: DECLARE strictIdentifier CONDITION (FOR SQLSTATE VALUE? sqlStateValue)?
96103
;
97104

98105
conditionValue
@@ -125,11 +132,11 @@ repeatStatement
125132
;
126133

127134
leaveStatement
128-
: LEAVE multipartIdentifier
135+
: LEAVE strictIdentifier
129136
;
130137

131138
iterateStatement
132-
: ITERATE multipartIdentifier
139+
: ITERATE strictIdentifier
133140
;
134141

135142
caseStatement
@@ -144,19 +151,19 @@ loopStatement
144151
;
145152

146153
forStatement
147-
: beginLabel? FOR (multipartIdentifier AS)? query DO compoundBody END FOR endLabel?
154+
: beginLabel? FOR (strictIdentifier AS)? query DO compoundBody END FOR endLabel?
148155
;
149156

150157
singleStatement
151158
: (statement|setResetStatement) SEMICOLON* EOF
152159
;
153160

154161
beginLabel
155-
: multipartIdentifier COLON
162+
: strictIdentifier COLON
156163
;
157164

158165
endLabel
159-
: multipartIdentifier
166+
: strictIdentifier
160167
;
161168

162169
singleExpression
@@ -321,7 +328,7 @@ statement
321328
| SHOW VIEWS ((FROM | IN) identifierReference)?
322329
(LIKE? pattern=stringLit)? #showViews
323330
| SHOW PARTITIONS identifierReference partitionSpec? #showPartitions
324-
| SHOW identifier? FUNCTIONS ((FROM | IN) ns=identifierReference)?
331+
| SHOW functionScope=simpleIdentifier? FUNCTIONS ((FROM | IN) ns=identifierReference)?
325332
(LIKE? (legacy=multipartIdentifier | pattern=stringLit))? #showFunctions
326333
| SHOW PROCEDURES ((FROM | IN) identifierReference)? #showProcedures
327334
| SHOW CREATE TABLE identifierReference (AS SERDE)? #showCreateTable
@@ -833,8 +840,8 @@ hint
833840
;
834841

835842
hintStatement
836-
: hintName=identifier
837-
| hintName=identifier LEFT_PAREN parameters+=primaryExpression (COMMA parameters+=primaryExpression)* RIGHT_PAREN
843+
: hintName=simpleIdentifier
844+
| hintName=simpleIdentifier LEFT_PAREN parameters+=primaryExpression (COMMA parameters+=primaryExpression)* RIGHT_PAREN
838845
;
839846

840847
fromClause
@@ -1241,7 +1248,7 @@ primaryExpression
12411248
| identifier #columnReference
12421249
| base=primaryExpression DOT fieldName=identifier #dereference
12431250
| LEFT_PAREN expression RIGHT_PAREN #parenthesizedExpression
1244-
| EXTRACT LEFT_PAREN field=identifier FROM source=valueExpression RIGHT_PAREN #extract
1251+
| EXTRACT LEFT_PAREN field=simpleIdentifier FROM source=valueExpression RIGHT_PAREN #extract
12451252
| (SUBSTR | SUBSTRING) LEFT_PAREN str=valueExpression (FROM | COMMA) pos=valueExpression
12461253
((FOR | COMMA) len=valueExpression)? RIGHT_PAREN #substring
12471254
| TRIM LEFT_PAREN trimOption=(BOTH | LEADING | TRAILING)? (trimStr=valueExpression)?
@@ -1297,7 +1304,7 @@ constant
12971304
;
12981305

12991306
namedParameterMarker
1300-
: COLON identifier
1307+
: COLON simpleIdentifier
13011308
;
13021309
comparisonOperator
13031310
: EQ | NEQ | NEQJ | LT | LTE | GT | GTE | NSEQ
@@ -1599,13 +1606,32 @@ identifier
15991606
| {!SQL_standard_keyword_behavior}? strictNonReserved
16001607
;
16011608

1609+
// simpleIdentifier: like identifier but without IDENTIFIER('literal') support
1610+
// Use this for contexts where IDENTIFIER() syntax is not appropriate:
1611+
// - Named parameters (:param_name)
1612+
// - Extract field names (EXTRACT(field FROM ...))
1613+
// - Other keyword-like or string-like uses
1614+
simpleIdentifier
1615+
: simpleStrictIdentifier
1616+
| {!SQL_standard_keyword_behavior}? strictNonReserved
1617+
;
1618+
16021619
strictIdentifier
16031620
: IDENTIFIER #unquotedIdentifier
16041621
| quotedIdentifier #quotedIdentifierAlternative
1622+
| {!legacy_identifier_clause_only}? IDENTIFIER_KW LEFT_PAREN stringLit RIGHT_PAREN #identifierLiteral
16051623
| {SQL_standard_keyword_behavior}? ansiNonReserved #unquotedIdentifier
16061624
| {!SQL_standard_keyword_behavior}? nonReserved #unquotedIdentifier
16071625
;
16081626

1627+
// simpleStrictIdentifier: like strictIdentifier but without IDENTIFIER('literal') support
1628+
simpleStrictIdentifier
1629+
: IDENTIFIER #simpleUnquotedIdentifier
1630+
| quotedIdentifier #simpleQuotedIdentifierAlternative
1631+
| {SQL_standard_keyword_behavior}? ansiNonReserved #simpleUnquotedIdentifier
1632+
| {!SQL_standard_keyword_behavior}? nonReserved #simpleUnquotedIdentifier
1633+
;
1634+
16091635
quotedIdentifier
16101636
: BACKQUOTED_IDENTIFIER
16111637
| {double_quoted_identifiers}? DOUBLEQUOTED_STRING

sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala

Lines changed: 124 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ import java.util.Locale
2020

2121
import scala.jdk.CollectionConverters._
2222

23-
import org.antlr.v4.runtime.Token
23+
import org.antlr.v4.runtime.{ParserRuleContext, Token}
2424
import org.antlr.v4.runtime.tree.ParseTree
2525

2626
import org.apache.spark.SparkException
2727
import org.apache.spark.sql.catalyst.parser.SqlBaseParser._
2828
import org.apache.spark.sql.catalyst.util.CollationFactory
2929
import org.apache.spark.sql.catalyst.util.SparkParserUtils.{string, withOrigin}
3030
import org.apache.spark.sql.connector.catalog.IdentityColumnSpec
31-
import org.apache.spark.sql.errors.QueryParsingErrors
31+
import org.apache.spark.sql.errors.{DataTypeErrorsBase, QueryParsingErrors}
3232
import org.apache.spark.sql.internal.SqlApiConf
3333
import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, CalendarIntervalType, CharType, DataType, DateType, DayTimeIntervalType, DecimalType, DoubleType, FloatType, GeographyType, GeometryType, IntegerType, LongType, MapType, MetadataBuilder, NullType, ShortType, StringType, StructField, StructType, TimestampNTZType, TimestampType, TimeType, VarcharType, VariantType, YearMonthIntervalType}
3434

@@ -60,12 +60,52 @@ import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType,
6060
*
6161
* @see
6262
* [[org.apache.spark.sql.catalyst.parser.AstBuilder]] for the full SQL statement parser
63+
*
64+
* ==CRITICAL: Extracting Identifier Names==
65+
*
66+
* When extracting identifier names from parser contexts, you MUST use the helper methods provided
67+
* by this class instead of calling ctx.getText() directly:
68+
*
69+
* - '''getIdentifierText(ctx)''': For single identifiers (column names, aliases, window names)
70+
* - '''getIdentifierParts(ctx)''': For qualified identifiers (table names, schema.table)
71+
*
72+
* '''DO NOT use ctx.getText() or ctx.identifier.getText()''' directly! These methods do not
73+
* handle the IDENTIFIER('literal') syntax and will cause incorrect behavior.
74+
*
75+
* The IDENTIFIER('literal') syntax allows string literals to be used as identifiers at parse time
76+
* (e.g., IDENTIFIER('my_col') resolves to the identifier my_col). If you use getText(), you'll
77+
* get the raw text "IDENTIFIER('my_col')" instead of "my_col", breaking the feature.
78+
*
79+
* Example:
80+
* {{{
81+
* // WRONG - does not handle IDENTIFIER('literal'):
82+
* val name = ctx.identifier.getText
83+
* SubqueryAlias(ctx.name.getText, plan)
84+
*
85+
* // CORRECT - handles both regular identifiers and IDENTIFIER('literal'):
86+
* val name = getIdentifierText(ctx.identifier)
87+
* SubqueryAlias(getIdentifierText(ctx.name), plan)
88+
* }}}
6389
*/
64-
class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
90+
class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with DataTypeErrorsBase {
6591
protected def typedVisit[T](ctx: ParseTree): T = {
6692
ctx.accept(this).asInstanceOf[T]
6793
}
6894

95+
/**
96+
* Public helper to extract identifier parts from a context. This is exposed as public to allow
97+
* utility classes like ParserUtils to reuse the identifier resolution logic without duplicating
98+
* code.
99+
*
100+
* @param ctx
101+
* The parser context containing the identifier.
102+
* @return
103+
* Sequence of identifier parts.
104+
*/
105+
def extractIdentifierParts(ctx: ParserRuleContext): Seq[String] = {
106+
getIdentifierParts(ctx)
107+
}
108+
69109
override def visitSingleDataType(ctx: SingleDataTypeContext): DataType = withOrigin(ctx) {
70110
typedVisit[DataType](ctx.dataType)
71111
}
@@ -161,11 +201,89 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
161201
}
162202

163203
/**
164-
* Create a multi-part identifier.
204+
* Parse a string into a multi-part identifier. Subclasses should override this method to
205+
* provide proper multi-part identifier parsing with access to a full SQL parser.
206+
*
207+
* For example, in AstBuilder, this would parse "`catalog`.`schema`.`table`" into Seq("catalog",
208+
* "schema", "table").
209+
*
210+
* This method is only called when parsing IDENTIFIER('literal') where the literal contains a
211+
* qualified identifier (e.g., IDENTIFIER('schema.table')). Since DataTypeAstBuilder only parses
212+
* data types (not full SQL with qualified table names), this should never be called in
213+
* practice. The base implementation throws an error to catch unexpected usage.
214+
*
215+
* @param identifier
216+
* The identifier string to parse, potentially containing dots and backticks.
217+
* @return
218+
* Sequence of identifier parts.
219+
*/
220+
protected def parseMultipartIdentifier(identifier: String): Seq[String] = {
221+
throw SparkException.internalError(
222+
"parseMultipartIdentifier must be overridden by subclasses. " +
223+
s"Attempted to parse: $identifier")
224+
}
225+
226+
/**
227+
* Get the identifier parts from a context, handling both regular identifiers and
228+
* IDENTIFIER('literal'). This method is used to support identifier-lite syntax where
229+
* IDENTIFIER('string') is folded at parse time. For qualified identifiers like
230+
* IDENTIFIER('`catalog`.`schema`'), this will parse the string and return multiple parts.
231+
*
232+
* Subclasses should override this method to provide actual parsing logic.
233+
*/
234+
protected def getIdentifierParts(ctx: ParserRuleContext): Seq[String] = {
235+
ctx match {
236+
case idCtx: IdentifierContext =>
237+
// identifier can be either strictIdentifier or strictNonReserved.
238+
// Recursively process the strictIdentifier.
239+
Option(idCtx.strictIdentifier()).map(getIdentifierParts).getOrElse(Seq(ctx.getText))
240+
241+
case idLitCtx: IdentifierLiteralContext =>
242+
// For IDENTIFIER('literal') in strictIdentifier.
243+
val literalValue = string(visitStringLit(idLitCtx.stringLit()))
244+
// Parse the string to handle qualified identifiers like "`cat`.`schema`".
245+
parseMultipartIdentifier(literalValue)
246+
247+
case errCapture: ErrorCapturingIdentifierContext =>
248+
// Regular identifier with errorCapturingIdentifierExtra.
249+
// Need to recursively handle identifier which might itself be IDENTIFIER('literal').
250+
Option(errCapture.identifier())
251+
.flatMap(id => Option(id.strictIdentifier()).map(getIdentifierParts))
252+
.getOrElse(Seq(ctx.getText))
253+
254+
case _ =>
255+
// For regular identifiers, just return the text as a single part.
256+
Seq(ctx.getText)
257+
}
258+
}
259+
260+
/**
261+
* Get the text of a SINGLE identifier, handling both regular identifiers and
262+
* IDENTIFIER('literal'). This method REQUIRES that the identifier be unqualified (single part
263+
* only). If IDENTIFIER('qualified.name') is used where a single identifier is required, this
264+
* will error.
265+
*/
266+
protected def getIdentifierText(ctx: ParserRuleContext): String = {
267+
val parts = getIdentifierParts(ctx)
268+
if (parts.size > 1) {
269+
throw new ParseException(
270+
errorClass = "IDENTIFIER_TOO_MANY_NAME_PARTS",
271+
messageParameters = Map("identifier" -> toSQLId(parts), "limit" -> "1"),
272+
ctx)
273+
}
274+
parts.head
275+
}
276+
277+
/**
278+
* Create a multi-part identifier. Handles identifier-lite with qualified identifiers like
279+
* IDENTIFIER('`cat`.`schema`').table
165280
*/
166281
override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] =
167282
withOrigin(ctx) {
168-
ctx.parts.asScala.map(_.getText).toSeq
283+
// Each part is an errorCapturingIdentifier (which wraps identifier).
284+
// getIdentifierParts recursively handles IDENTIFIER('literal') syntax through
285+
// identifier -> strictIdentifier -> identifierLiteral.
286+
ctx.parts.asScala.flatMap(getIdentifierParts).toSeq
169287
}
170288

171289
/**
@@ -351,7 +469,7 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
351469
}
352470

353471
StructField(
354-
name = colName.getText,
472+
name = getIdentifierText(colName),
355473
dataType = typedVisit[DataType](ctx.dataType),
356474
nullable = NULL == null,
357475
metadata = builder.build())

sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/SubstituteParmsAstBuilder.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ class SubstituteParmsAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
8181
*/
8282
override def visitNamedParameterLiteral(ctx: NamedParameterLiteralContext): AnyRef =
8383
withOrigin(ctx) {
84-
val paramName = ctx.namedParameterMarker().identifier().getText
84+
// Named parameters use simpleIdentifier, so .getText() is correct.
85+
val paramName = ctx.namedParameterMarker().simpleIdentifier().getText
8586
namedParams += paramName
8687

8788
// Calculate the location of the entire parameter (including the colon)
@@ -117,7 +118,8 @@ class SubstituteParmsAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] {
117118
*/
118119
override def visitNamedParameterMarkerRule(ctx: NamedParameterMarkerRuleContext): AnyRef =
119120
withOrigin(ctx) {
120-
val paramName = ctx.namedParameterMarker().identifier().getText
121+
// Named parameters use simpleIdentifier, so .getText() is correct.
122+
val paramName = ctx.namedParameterMarker().simpleIdentifier().getText
121123
namedParams += paramName
122124

123125
// Calculate the location of the entire parameter (including the colon)

sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/parsers.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,15 @@ case class UnclosedCommentProcessor(command: String, tokenStream: CommonTokenStr
430430
}
431431

432432
object DataTypeParser extends AbstractParser {
433-
override protected def astBuilder: DataTypeAstBuilder = new DataTypeAstBuilder
433+
override protected def astBuilder: DataTypeAstBuilder = new DataTypeAstBuilder {
434+
// DataTypeParser only parses data types, not full SQL.
435+
// Multi-part identifiers should not appear in IDENTIFIER() within type definitions.
436+
override protected def parseMultipartIdentifier(identifier: String): Seq[String] = {
437+
throw SparkException.internalError(
438+
"DataTypeParser does not support multi-part identifiers in IDENTIFIER(). " +
439+
s"Attempted to parse: $identifier")
440+
}
441+
}
434442
}
435443

436444
object AbstractParser extends Logging {
@@ -476,6 +484,7 @@ object AbstractParser extends Logging {
476484
parser.SQL_standard_keyword_behavior = conf.enforceReservedKeywords
477485
parser.double_quoted_identifiers = conf.doubleQuotedIdentifiers
478486
parser.parameter_substitution_enabled = !conf.legacyParameterSubstitutionConstantsOnly
487+
parser.legacy_identifier_clause_only = conf.legacyIdentifierClauseOnly
479488
}
480489

481490
/**

sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase {
477477
ctx)
478478
}
479479

480-
def showFunctionsUnsupportedError(identifier: String, ctx: IdentifierContext): Throwable = {
480+
def showFunctionsUnsupportedError(identifier: String, ctx: ParserRuleContext): Throwable = {
481481
new ParseException(
482482
errorClass = "INVALID_SQL_SYNTAX.SHOW_FUNCTIONS_INVALID_SCOPE",
483483
messageParameters = Map("scope" -> toSQLId(identifier)),

sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ private[sql] trait SqlApiConf {
5151
def parserDfaCacheFlushThreshold: Int
5252
def parserDfaCacheFlushRatio: Double
5353
def legacyParameterSubstitutionConstantsOnly: Boolean
54+
def legacyIdentifierClauseOnly: Boolean
5455
}
5556

5657
private[sql] object SqlApiConf {
@@ -106,4 +107,5 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf {
106107
override def parserDfaCacheFlushThreshold: Int = -1
107108
override def parserDfaCacheFlushRatio: Double = -1.0
108109
override def legacyParameterSubstitutionConstantsOnly: Boolean = false
110+
override def legacyIdentifierClauseOnly: Boolean = false
109111
}

0 commit comments

Comments
 (0)