pmd · oowekyala · Apr 7, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java
@@ -12,6 +12,7 @@
 
 import net.sourceforge.pmd.cpd.CpdLexer;
 import net.sourceforge.pmd.lang.TokenManager;
+import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrLexerBehavior;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrToken;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrTokenManager;
 import net.sourceforge.pmd.lang.document.TextDocument;
@@ -23,7 +24,15 @@ public abstract class AntlrCpdLexer extends CpdLexerBase<AntlrToken> {
  @Override
  protected final TokenManager<AntlrToken> makeLexerImpl(TextDocument doc) throws IOException {
  CharStream charStream = CharStreams.fromReader(doc.newReader(), doc.getFileId().getAbsolutePath());
- return new AntlrTokenManager(getLexerForSource(charStream), doc);
+ return new AntlrTokenManager(getLexerForSource(charStream), doc, getLexerBehavior());
+ }
+
+ /**
+ * Override this method to customize some aspects of the
+ * lexer.
+ */
+ protected AntlrLexerBehavior getLexerBehavior() {
+ return new AntlrLexerBehavior();
  }
 
  protected abstract Lexer getLexerForSource(CharStream charStream);

diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java
@@ -0,0 +1,32 @@
+/**
+ * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
+ */
+
+package net.sourceforge.pmd.lang.ast.impl.antlr4;
+
+import org.antlr.v4.runtime.Token;
+
+import net.sourceforge.pmd.cpd.CpdLanguageProperties;
+
+/**
+ * Customization point for the way Antlr tokens are converted
+ * into PMD/CPD tokens.
+ */
+public class AntlrLexerBehavior {
+
+    /**
+     * Computes the image to record for the given Antlr token. Subclasses may
+     * override this to normalize images, for instance by folding all keywords to
+     * a single case to get case-insensitivity, or by substituting a placeholder
+     * for literals to implement {@link CpdLanguageProperties#CPD_ANONYMIZE_LITERALS}.
+     * This default implementation applies no transformation.
+     *
+     * @param token A token produced by the Antlr lexer
+     *
+     * @return The image to use for the token, by default {@link Token#getText()}
+     */
+    protected String getTokenImage(Token token) {
+        final String rawText = token.getText();
+        return rawText;
+    }
+}
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java
@@ -17,9 +17,13 @@
  */
 public class AntlrToken implements GenericToken<AntlrToken> {
 
- private final Token token;
  private final AntlrToken previousComment;
  private final TextDocument textDoc;
+ private final String image;
+ private final int endOffset;
+ private final int startOffset;
+ private final int channel;
+ private final int kind;
  AntlrToken next;
 
 
@@ -30,10 +34,28 @@ public class AntlrToken implements GenericToken<AntlrToken> {
  * @param previousComment The previous comment
  * @param textDoc The text document
  */
+    AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc, AntlrLexerBehavior behavior) {
+        this.kind = token.getType();
+        this.channel = token.getChannel();
+        this.startOffset = token.getStartIndex();
+        this.endOffset = 1 + token.getStopIndex(); // stored as an exclusive end offset
+        this.image = behavior.getTokenImage(token); // the behavior picks the image, possibly transformed
+        this.textDoc = textDoc;
+        this.previousComment = previousComment;
+    }
+
+ /**
+ * @deprecated Don't create antlr tokens directly, use an {@link AntlrTokenManager}
+ */
+ @Deprecated
  public AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc) {
- this.token = token;
  this.previousComment = previousComment;
  this.textDoc = textDoc;
+ this.image = token.getText();
+ this.startOffset = token.getStartIndex();
+ this.endOffset = token.getStopIndex() + 1; // exclusive
+ this.channel = token.getChannel();
+ this.kind = token.getType();
  }
 
  @Override
@@ -48,13 +70,13 @@ public AntlrToken getPreviousComment() {
 
  @Override
  public CharSequence getImageCs() {
- return token.getText();
+ return image;
  }
 
  /** Returns a text region with the coordinates of this token. */
  @Override
  public TextRegion getRegion() {
- return TextRegion.fromBothOffsets(token.getStartIndex(), token.getStopIndex() + 1);
+ return TextRegion.fromBothOffsets(startOffset, endOffset);
  }
 
  @Override
@@ -74,14 +96,14 @@ public int compareTo(AntlrToken o) {
 
  @Override
  public int getKind() {
- return token.getType();
+ return kind;
  }
 
  public boolean isHidden() {
  return !isDefault();
  }
 
  public boolean isDefault() {
- return token.getChannel() == Lexer.DEFAULT_TOKEN_CHANNEL;
+ return channel == Lexer.DEFAULT_TOKEN_CHANNEL;
  }
 }
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java
@@ -20,12 +20,20 @@ public class AntlrTokenManager implements TokenManager<AntlrToken> {
 
  private final Lexer lexer;
  private final TextDocument textDoc;
+ private final AntlrLexerBehavior behavior;
  private AntlrToken previousToken;
 
 
  public AntlrTokenManager(final Lexer lexer, final TextDocument textDocument) {
+ this(lexer, textDocument, new AntlrLexerBehavior());
+ }
+
+ public AntlrTokenManager(final Lexer lexer,
+ final TextDocument textDocument,
+ final AntlrLexerBehavior behavior) {
  this.lexer = lexer;
  this.textDoc = textDocument;
+ this.behavior = behavior;
  resetListeners();
  }
 
@@ -40,7 +48,7 @@ public AntlrToken getNextToken() {
 
  private AntlrToken getNextTokenFromAnyChannel() {
  final AntlrToken previousComment = previousToken != null && previousToken.isHidden() ? previousToken : null;
- final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc);
+ final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc, this.behavior);
  if (previousToken != null) {
  previousToken.next = currentToken;
  }

diff --git a/pmd-plsql/etc/grammar/PLSQL.jjt b/pmd-plsql/etc/grammar/PLSQL.jjt
@@ -5239,7 +5239,7 @@ TOKEN :
  ( "\"" <LETTER> ( <LETTER> | <DIGIT> | "$" | "_" | "#" )* "\"" )
 >
 |
-< LEXICAL_PARAMETER:
+< #LEXICAL_PARAMETER:
  (
  ("&&" | "&")
  (
@@ -5263,12 +5263,6 @@ TOKEN :
 |
 < QUOTED_LITERAL: "\"" (<_WHATEVER_CHARACTER_WO_QUOTE> | <SPECIAL_CHARACTERS> | "\\\"")* "\"" >
 |
-< SQLDATA_CLASS: "SQLData" >
-|
-< CUSTOMDATUM_CLASS: "CustomDatum" >
-|
-< ORADATA_CLASS: "OraData" >
-|
 < JAVA_INTERFACE_CLASS: ( "SQLData" | "CustomDatum" | "OraData" ) >
 //|
 //< #BOOLEAN_LITERAL: "TRUE" | "FALSE" >
@@ -6677,7 +6671,7 @@ ASTID ID(): {}
  //20120427 | <OID>
  //20120428 | <AGGREGATE>
  //| <SYS_REFCURSOR>
- | <JAVA_INTERFACE_CLASS> | <SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
+ | <JAVA_INTERFACE_CLASS>
  //20120427 | <EXTERNAL>
  //SRT 20090608 ALTER TYPE key words
  //| <ADD>
@@ -6967,15 +6961,15 @@ ASTTypeKeyword TypeKeyword(): {}
  <TIMEZONE_REGION> | <TIMEZONE_ABBR> | <TIMEZONE_MINUTE> | <TIMEZONE_HOUR> | <DOUBLE> | <PRECISION> |
  <VARRAY> |
  <YEAR> | <LOCAL> | <WITH> | <ZONE>
- | <JAVA_INTERFACE_CLASS> | <SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
+ | <JAVA_INTERFACE_CLASS>
  )
  { jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
 }
 
 ASTJavaInterfaceClass JavaInterfaceClass(): {}
 {
  (
- <SQLDATA_CLASS> | <CUSTOMDATUM_CLASS> | <ORADATA_CLASS>
+ <JAVA_INTERFACE_CLASS>
  )
  { jjtThis.setImage(token.getImage()) ; jjtThis.value = token ; return jjtThis ; }
 }
diff --git a/pmd-plsql/src/main/java/net/sourceforge/pmd/lang/plsql/ast/PLSQLParser.java b/pmd-plsql/src/main/java/net/sourceforge/pmd/lang/plsql/ast/PLSQLParser.java
@@ -4,16 +4,53 @@
 
 package net.sourceforge.pmd.lang.plsql.ast;
 
+import org.checkerframework.checker.nullness.qual.Nullable;
+
 import net.sourceforge.pmd.benchmark.TimeTracker;
 import net.sourceforge.pmd.lang.ast.ParseException;
 import net.sourceforge.pmd.lang.ast.impl.javacc.CharStream;
+import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken;
+import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccTokenDocument;
 import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccTokenDocument.TokenDocumentBehavior;
 import net.sourceforge.pmd.lang.ast.impl.javacc.JjtreeParserAdapter;
 import net.sourceforge.pmd.lang.plsql.symboltable.SymbolFacade;
 
 public class PLSQLParser extends JjtreeParserAdapter<ASTInput> {
 
- private static final TokenDocumentBehavior TOKEN_BEHAVIOR = new TokenDocumentBehavior(PLSQLTokenKinds.TOKEN_NAMES);
+ // Stores images of constant string literals.
+ // This is to reuse the image strings for PLSQL keywords.
+ // JavaCC unfortunately does not store a constant image for those
+ // keywords because the grammar is case-insensitive.
+ // This optimization has the effect that the image of keyword tokens
+ // is always upper-case, regardless of the actual case used in the code.
+ // The original casing can be found by looking at the TextDocument for the file.
+
+ // NOTE: this array is indexed by token kind, so its size should be greater than the number of token kinds declared in the grammar.
+ private static final String[] STRING_LITERAL_IMAGES_EXTRA = new String[512];
+
+ static {
+ int i = 0;
+ String image = PLSQLTokenKinds.describe(i);
+ while (image != null && i < STRING_LITERAL_IMAGES_EXTRA.length) {
+ if (image.startsWith("\"") && image.endsWith("\"")) {
+ // a string literal image, remove the quotes
+ image = image.substring(1, image.length() - 1);
+ STRING_LITERAL_IMAGES_EXTRA[i] = image;
+ }
+ i++;
+ }
+ }
+
+    private static final TokenDocumentBehavior TOKEN_BEHAVIOR = new TokenDocumentBehavior(PLSQLTokenKinds.TOKEN_NAMES) {
+        @Override
+        public JavaccToken createToken(JavaccTokenDocument self, int kind, CharStream cs, @Nullable String image) {
+            // No image supplied by the caller: fall back to the cached constant image, if this kind has a slot.
+            if (image == null && kind >= 0 && kind < STRING_LITERAL_IMAGES_EXTRA.length) {
+                image = STRING_LITERAL_IMAGES_EXTRA[kind]; // may still be null when nothing is cached for this kind
+            }
+            return super.createToken(self, kind, cs, image);
+        }
+    };
 
  @Override
  protected TokenDocumentBehavior tokenBehavior() {

diff --git a/pmd-plsql/src/main/java/net/sourceforge/pmd/lang/plsql/cpd/PLSQLCpdLexer.java b/pmd-plsql/src/main/java/net/sourceforge/pmd/lang/plsql/cpd/PLSQLCpdLexer.java
@@ -4,6 +4,8 @@
 
 package net.sourceforge.pmd.lang.plsql.cpd;
 
+import java.util.Locale;
+
 import net.sourceforge.pmd.cpd.CpdLanguageProperties;
 import net.sourceforge.pmd.cpd.impl.JavaccCpdLexer;
 import net.sourceforge.pmd.lang.LanguagePropertyBundle;
@@ -37,16 +39,25 @@ protected String getImage(JavaccToken plsqlToken) {
  String image = plsqlToken.getImage();
 
  if (ignoreIdentifiers && plsqlToken.kind == PLSQLTokenKinds.IDENTIFIER) {
- image = String.valueOf(plsqlToken.kind);
- }
-
- if (ignoreLiterals && (plsqlToken.kind == PLSQLTokenKinds.UNSIGNED_NUMERIC_LITERAL
+ image = "<identifier>";
+ } else if (ignoreLiterals && (plsqlToken.kind == PLSQLTokenKinds.UNSIGNED_NUMERIC_LITERAL
  || plsqlToken.kind == PLSQLTokenKinds.FLOAT_LITERAL
  || plsqlToken.kind == PLSQLTokenKinds.INTEGER_LITERAL
  || plsqlToken.kind == PLSQLTokenKinds.CHARACTER_LITERAL
  || plsqlToken.kind == PLSQLTokenKinds.STRING_LITERAL
  || plsqlToken.kind == PLSQLTokenKinds.QUOTED_LITERAL)) {
- image = String.valueOf(plsqlToken.kind);
+ // the token kind is preserved
+ image = PLSQLTokenKinds.describe(plsqlToken.kind);
+ } else if (plsqlToken.kind != PLSQLTokenKinds.CHARACTER_LITERAL
+ && plsqlToken.kind != PLSQLTokenKinds.STRING_LITERAL
+ && plsqlToken.kind != PLSQLTokenKinds.QUOTED_LITERAL) {
+ // PLSQL is case-insensitive, but the contents of
+ // string literals and the like are case-sensitive.
+ // Note: tokens are normalized to uppercase to make CPD case-insensitive.
+ // We use uppercase and not lowercase because that way, PLSQL keywords
+ // will be returned unchanged (they are already uppercase, see PLSQLParser),
+ // therefore creating fewer strings in memory.
+ image = image.toUpperCase(Locale.ROOT);
  }
  return image;
  }

diff --git a/pmd-plsql/src/test/java/net/sourceforge/pmd/lang/plsql/cpd/PLSQLCpdLexerTest.java b/pmd-plsql/src/test/java/net/sourceforge/pmd/lang/plsql/cpd/PLSQLCpdLexerTest.java
@@ -6,8 +6,10 @@
 
 import org.junit.jupiter.api.Test;
 
+import net.sourceforge.pmd.cpd.CpdLanguageProperties;
 import net.sourceforge.pmd.lang.plsql.PLSQLLanguageModule;
 import net.sourceforge.pmd.lang.test.cpd.CpdTextComparisonTest;
+import net.sourceforge.pmd.lang.test.cpd.LanguagePropertyConfig;
 
 class PLSQLCpdLexerTest extends CpdTextComparisonTest {
 
@@ -29,4 +31,15 @@ void testSpecialComments() {
  void testTabWidth() {
  doTest("tabWidth");
  }
+
+ @Test
+ void anonymizeLiterals() {
+ doTest("sample-plsql", "_ignore-literals", ignoreLiterals());
+ }
+
+ LanguagePropertyConfig ignoreLiterals() {
+ return props -> {
+ props.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS, true);
+ };
+ }
 }