SOLR-13360 StringIndexOutOfBoundsException: String index out of range…

…: -3
apache · Jan 24, 2025 · 7d45759 · 7d45759
1 parent b3b20d6
commit 7d45759
Show file tree

Hide file tree

Showing 4 changed files with 364 additions and 40 deletions.
diff --git a/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java b/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java
@@ -23,6 +23,8 @@
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 import org.apache.lucene.index.IndexReader;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.CursorMarkParams;
@@ -212,61 +214,125 @@ public List<SpellCheckCollation> collate(
     return collations;
   }
 
-  private String getCollation(String origQuery, List<SpellCheckCorrection> corrections) {
+  static String getCollation(String origQuery, List<SpellCheckCorrection> correctionsAll) {
+    List<SpellCheckCorrection> corrections = mergeSpellCheckCorrections(correctionsAll);
     StringBuilder collation = new StringBuilder(origQuery);
     int offset = 0;
-    String corr = "";
-    for (int i = 0; i < corrections.size(); i++) {
-      SpellCheckCorrection correction = corrections.get(i);
+    for (SpellCheckCorrection correction : corrections) {
       Token tok = correction.getOriginal();
+
       // we are replacing the query in order, but injected terms might cause
       // illegal offsets due to previous replacements.
       if (tok.getPositionIncrement() == 0) continue;
-      corr = correction.getCorrection();
-      boolean addParenthesis = false;
-      Character requiredOrProhibited = null;
-      int indexOfSpace = corr.indexOf(' ');
-      StringBuilder corrSb = new StringBuilder(corr);
-      int bump = 1;
-
-      // If the correction contains whitespace (because it involved breaking a word in 2+ words),
-      // then be sure all of the new words have the same optional/required/prohibited status in the
-      // query.
-      while (indexOfSpace > -1 && indexOfSpace < corr.length() - 1) {
-        char previousChar = tok.startOffset() > 0 ? origQuery.charAt(tok.startOffset() - 1) : ' ';
-        if (previousChar == '-' || previousChar == '+') {
-          corrSb.insert(indexOfSpace + bump, previousChar);
-          if (requiredOrProhibited == null) {
-            requiredOrProhibited = previousChar;
-          }
-          bump++;
-        } else if ((tok.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG)
-            == QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) {
-          addParenthesis = true;
-          corrSb.insert(indexOfSpace + bump, "AND ");
-          bump += 4;
-        }
-        indexOfSpace = correction.getCorrection().indexOf(' ', indexOfSpace + bump);
-      }
 
-      int oneForReqOrProhib = 0;
-      if (addParenthesis) {
-        if (requiredOrProhibited != null) {
-          corrSb.insert(0, requiredOrProhibited);
-          oneForReqOrProhib++;
-        }
-        corrSb.insert(0, '(');
-        corrSb.append(')');
-      }
-      corr = corrSb.toString();
+      Entry<String, Integer> whitespaceCorrection =
+          whitespaceCorrection(origQuery, correction.getCorrection(), tok);
+      String corr = whitespaceCorrection.getKey();
+      int oneForReqOrProhib = whitespaceCorrection.getValue();
+
       int startIndex = tok.startOffset() + offset - oneForReqOrProhib;
+
+      if (startIndex < 0) { // avoiding StringIndexOutOfBoundsException see SOLR-13360
+        logSIOOBEDebugData(origQuery, correctionsAll);
+        // is anything useful in the current collation or just give up?
+        // TODO return origQuery;
+      }
+
       int endIndex = tok.endOffset() + offset;
+
       collation.replace(startIndex, endIndex, corr);
       offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
     }
     return collation.toString();
   }
 
+  static Entry<String, Integer> whitespaceCorrection(
+      final String origQuery, final String correction, Token tok) {
+    String corr = correction;
+    boolean addParenthesis = false;
+    Character requiredOrProhibited = null;
+    int indexOfSpace = corr.indexOf(' ');
+    StringBuilder corrSb = new StringBuilder(corr);
+    int bump = 1;
+
+    // If the correction contains whitespace (because it involved breaking a word in 2+ words),
+    // then be sure all of the new words have the same optional/required/prohibited status in the
+    // query.
+    while (indexOfSpace > -1 && indexOfSpace < corr.length() - 1) {
+      char previousChar = tok.startOffset() > 0 ? origQuery.charAt(tok.startOffset() - 1) : ' ';
+      if (previousChar == '-' || previousChar == '+') {
+        corrSb.insert(indexOfSpace + bump, previousChar);
+        if (requiredOrProhibited == null) {
+          requiredOrProhibited = previousChar;
+        }
+        bump++;
+      } else if ((tok.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG)
+          == QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) {
+        addParenthesis = true;
+        corrSb.insert(indexOfSpace + bump, "AND ");
+        bump += 4;
+      }
+      indexOfSpace = correction.indexOf(' ', indexOfSpace + bump);
+    }
+
+    int oneForReqOrProhib = 0;
+    if (addParenthesis) {
+      if (requiredOrProhibited != null) {
+        corrSb.insert(0, requiredOrProhibited);
+        oneForReqOrProhib++;
+      }
+      corrSb.insert(0, '(');
+      corrSb.append(')');
+    }
+    corr = corrSb.toString();
+    return Map.entry(corr, oneForReqOrProhib);
+  }
+
+  private static volatile boolean mergeSpellCheckCorrections = true; // TODO system property
+
+  static List<SpellCheckCorrection> mergeSpellCheckCorrections(
+      List<SpellCheckCorrection> corrections) {
+    if (!mergeSpellCheckCorrections) {
+      return corrections;
+    }
+
+    // TODO fix overlapping tokan intervals
+    // - removing all entries with getPositionIncrement=0
+    // - sorting by token.startOffset
+    // - remove overlapping [startOffset, endOffset] intervals
+
+    return corrections;
+  }
+
+  private static volatile boolean logSIOOBEDebugData = true;
+
+  static void logSIOOBEDebugData(String origQuery, List<SpellCheckCorrection> corrections) {
+    if (!logSIOOBEDebugData) {
+      return;
+    }
+    logSIOOBEDebugData = false;
+    StringBuilder info = new StringBuilder(origQuery);
+    info.append("[");
+    for (SpellCheckCorrection correction : corrections) {
+      Token tok = correction.getOriginal();
+      info.append(
+          "('"
+              + tok
+              + "', "
+              + tok.startOffset()
+              + ", "
+              + tok.endOffset()
+              + ", "
+              + tok.getPositionIncrement()
+              + ", '"
+              + correction.getOriginal()
+              + "'),");
+    }
+    info.append("]");
+    info.append(", mergeSpellCheckCorrections=" + mergeSpellCheckCorrections);
+    log.warn("logging SIOOBE debug data, please report to SOLR-13360 {}", info);
+  }
+
   public SpellCheckCollator setMaxCollations(int maxCollations) {
     this.maxCollations = maxCollations;
     return this;

diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-collapseqparser.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-collapseqparser.xml
@@ -177,6 +177,13 @@
       <str name="field">a_s</str>
       <int name="minQueryLength">3</int>
     </lst>
+
+    <lst name="spellchecker">
+      <str name="name">direct_spelltest_t</str>
+      <str name="classname">solr.DirectSolrSpellChecker</str>
+      <str name="field">spelltest_t</str>
+      <int name="minQueryLength">3</int>
+    </lst>
   </searchComponent>
 
   <!--

diff --git a/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorCollationOnlyTest.java b/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorCollationOnlyTest.java
@@ -0,0 +1,136 @@
+package org.apache.solr.spelling;
+
+import java.util.List;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.Test;
+
+public class SpellCheckCollatorCollationOnlyTest extends SolrTestCaseJ4 {
+
+  // Tests SOLR-13360 using some manual correction scenario based on various observations recorded
+  // in the ticket.
+  //
+  // Notes
+  // - tokens with (tok.getPositionIncrement() == 0) will be filtered out.
+  // - replacing longer tokens with shorter ones can make the offset go negative, which seems ok as
+  // long as the startOffset is always in (strict) increasing order
+
+  @Test
+  public void testCollationGeneration1() {
+    // sample from synonyms.txt:
+    // # Synonyms used in semantic expansion
+    // tabby => tabby, cat, feline, animal
+    // persian => persian, cat, feline, animal
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration1Shuffle() {
+    // same as testCollationGeneration1 but I am manually shuffling the tokens
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration1Repeat() {
+    // same as testCollationGeneration1 but I am manually repeating one of the tokens
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 1st token - duplicated
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration2() {
+    // sample from synonyms.txt:
+    // panthera pardus, leopard|0.6
+    //
+    // Note. depending on the field type, this can end up as the list of tokens: [leopard, 0, 6,
+    // panthera, pardu, cats]
+
+    String origQuery = "panthera pardus cats";
+
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            correction("leopard", 0, 15, 1, "leopard"),
+            correction("0", 0, 15, 1, "0"),
+            correction("6", 0, 15, 1, "6"),
+            correction("panthera", 0, 8, 0, "panthera"),
+            correction("pardu", 9, 15, 1, "pardu"),
+            correction("cats", 16, 20, 1, "cat"));
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "pardu cat"; // TODO what is expected here?
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  private static SpellCheckCorrection correction(
+      String text, int start, int end, int positionIncrement, String correction) {
+    SpellCheckCorrection spellCheckCorrection = new SpellCheckCorrection();
+    spellCheckCorrection.setOriginal(token(text, start, end, positionIncrement));
+    spellCheckCorrection.setCorrection(correction);
+    spellCheckCorrection.setNumberOfOccurences(1);
+    return spellCheckCorrection;
+  }
+
+  private static Token token(String text, int start, int end, int positionIncrement) {
+    Token token = new Token(text, start, end);
+    token.setPositionIncrement(positionIncrement);
+    return token;
+  }
+}