apache · stillalex · Jan 24, 2025
diff --git a/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java b/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java
@@ -386,7 +386,7 @@ public String toString() {
     }
   }
 
-  private static class StartOffsetComparator implements Comparator<SpellCheckCorrection> {
+  static class StartOffsetComparator implements Comparator<SpellCheckCorrection> {
     @Override
     public int compare(SpellCheckCorrection o1, SpellCheckCorrection o2) {
       return o1.getOriginal().startOffset() - o2.getOriginal().startOffset();

diff --git a/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java b/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java
@@ -51,6 +51,14 @@ public class SpellCheckCollator {
   private boolean suggestionsMayOverlap = false;
   private int docCollectionLimit = 0;
 
+  private static volatile boolean mergeSpellCheckCorrections =
+      Boolean.parseBoolean(
+          System.getProperty("solr.SpellCheckCollator.mergeSpellCheckCorrections", "true"));
+
+  private static volatile boolean logSIOOBEDebugData =
+      Boolean.parseBoolean(
+          System.getProperty("solr.SpellCheckCollator.logSIOOBEDebugData", "true"));
+
   public List<SpellCheckCollation> collate(
       SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse) {
     List<SpellCheckCollation> collations = new ArrayList<>();
@@ -212,7 +220,8 @@ public List<SpellCheckCollation> collate(
     return collations;
   }
 
-  private String getCollation(String origQuery, List<SpellCheckCorrection> corrections) {
+  static String getCollation(String origQuery, List<SpellCheckCorrection> correctionsAll) {
+    List<SpellCheckCorrection> corrections = mergeSpellCheckCorrections(correctionsAll);
     StringBuilder collation = new StringBuilder(origQuery);
     int offset = 0;
     String corr = "";
@@ -260,13 +269,74 @@ private String getCollation(String origQuery, List<SpellCheckCorrection> correct
       }
       corr = corrSb.toString();
       int startIndex = tok.startOffset() + offset - oneForReqOrProhib;
+
+      if (startIndex < 0) { // avoiding StringIndexOutOfBoundsException see SOLR-13360
+        logSIOOBEDebugData(origQuery, correctionsAll);
+        break;
+      }
+
       int endIndex = tok.endOffset() + offset;
       collation.replace(startIndex, endIndex, corr);
       offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
     }
     return collation.toString();
   }
 
+  static List<SpellCheckCorrection> mergeSpellCheckCorrections(
+      List<SpellCheckCorrection> corrections) {
+    if (!mergeSpellCheckCorrections) {
+      return corrections;
+    }
+
+    // Note. fix overlapping token intervals
+    // - sorting by token.startOffset. unclear if this can cause some reshuffle where lower rank
+    // items with the same position can come first
+    // - remove overlapping [startOffset, endOffset] intervals
+
+    List<SpellCheckCorrection> filtered = new ArrayList<SpellCheckCorrection>(corrections);
+    filtered.sort(new PossibilityIterator.StartOffsetComparator());
+
+    int end = -1;
+    for (Iterator<SpellCheckCorrection> iterator = filtered.iterator(); iterator.hasNext(); ) {
+      SpellCheckCorrection correction = iterator.next();
+      Token t = correction.getOriginal();
+      if (t.startOffset() > end && t.endOffset() > end) {
+        end = t.endOffset();
+      } else {
+        iterator.remove();
+      }
+    }
+
+    return filtered;
+  }
+
+  static void logSIOOBEDebugData(String origQuery, List<SpellCheckCorrection> corrections) {
+    if (!logSIOOBEDebugData) {
+      return;
+    }
+    logSIOOBEDebugData = false;
+    StringBuilder info = new StringBuilder(origQuery);
+    info.append("[");
+    for (SpellCheckCorrection correction : corrections) {
+      Token tok = correction.getOriginal();
+      info.append(
+          "('"
+              + tok
+              + "', "
+              + tok.startOffset()
+              + ", "
+              + tok.endOffset()
+              + ", "
+              + tok.getPositionIncrement()
+              + ", '"
+              + correction.getOriginal()
+              + "'),");
+    }
+    info.append("]");
+    info.append(", mergeSpellCheckCorrections=" + mergeSpellCheckCorrections);
+    log.warn("logging SIOOBE debug data, please report to SOLR-13360 {}", info);
+  }
+
   public SpellCheckCollator setMaxCollations(int maxCollations) {
     this.maxCollations = maxCollations;
     return this;

diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-collapseqparser.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-collapseqparser.xml
@@ -177,6 +177,13 @@
       <str name="field">a_s</str>
       <int name="minQueryLength">3</int>
     </lst>
+
+    <lst name="spellchecker">
+      <str name="name">direct_spelltest_t</str>
+      <str name="classname">solr.DirectSolrSpellChecker</str>
+      <str name="field">spelltest_t</str>
+      <int name="minQueryLength">3</int>
+    </lst>
   </searchComponent>
 
   <!--

diff --git a/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorCollationOnlyTest.java b/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorCollationOnlyTest.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.spelling;
+
+import java.util.List;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.Test;
+
+public class SpellCheckCollatorCollationOnlyTest extends SolrTestCaseJ4 {
+
+  // Tests SOLR-13360 using some manual correction scenario based on various observations recorded
+  // in the ticket.
+  //
+  // Notes
+  // - tokens with (tok.getPositionIncrement() == 0) will be filtered out.
+  // - replacing longer tokens with shorter ones can make the offset go negative, which seems ok as
+  // long as the startOffset is always in (strict) increasing order
+
+  @Test
+  public void testCollationGeneration1() {
+    // sample from synonyms.txt:
+    // # Synonyms used in semantic expansion
+    // tabby => tabby, cat, feline, animal
+    // persian => persian, cat, feline, animal
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration1Shuffle() {
+    // same as testCollationGeneration1 but I am manually shuffling the tokens
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration1Repeat() {
+    // same as testCollationGeneration1 but I am manually repeating one of the tokens
+
+    String origQuery = "cats persians tabby";
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            // ref 1st token
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 1st token - duplicated
+            correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1
+
+            // ref 2nd token
+            correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2
+
+            // ref 3rd token
+            correction("tabbi", 14, 19, 1, "tabbi"), // original token
+            correction("cat", 14, 19, 0, "cat"), // synonym
+            correction("felin", 14, 19, 0, "felin"), // synonym
+            correction("anim", 14, 19, 0, "anim") // synonym
+            );
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "cat persian tabbi";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  @Test
+  public void testCollationGeneration2() {
+    // sample from synonyms.txt:
+    // panthera pardus, leopard|0.6
+    //
+    // Note. depending on the field type, this can end up as the list of tokens: [leopard, 0, 6,
+    // panthera, pardu, cats]
+
+    String origQuery = "panthera pardus cats";
+
+    List<SpellCheckCorrection> corrections =
+        List.of(
+            correction("leopard", 0, 15, 1, "leopard"),
+            correction("0", 0, 15, 1, "0"),
+            correction("6", 0, 15, 1, "6"),
+            correction("panthera", 0, 8, 0, "panthera"),
+            correction("pardu", 9, 15, 1, "pardu"),
+            correction("cats", 16, 20, 1, "cat"));
+
+    String collation = SpellCheckCollator.getCollation(origQuery, corrections);
+    String expected = "leopard cat";
+    assertEquals("Incorrect collation: " + collation, expected, collation);
+  }
+
+  private static SpellCheckCorrection correction(
+      String text, int start, int end, int positionIncrement, String correction) {
+    SpellCheckCorrection spellCheckCorrection = new SpellCheckCorrection();
+    spellCheckCorrection.setOriginal(token(text, start, end, positionIncrement));
+    spellCheckCorrection.setCorrection(correction);
+    spellCheckCorrection.setNumberOfOccurences(1);
+    return spellCheckCorrection;
+  }
+
+  private static Token token(String text, int start, int end, int positionIncrement) {
+    Token token = new Token(text, start, end);
+    token.setPositionIncrement(positionIncrement);
+    return token;
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorWithSynonymTest.java b/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorWithSynonymTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.spelling;
+
+import java.util.List;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SpellingParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.SearchComponent;
+import org.apache.solr.handler.component.SpellCheckComponent;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrRequestHandler;
+import org.apache.solr.response.SolrQueryResponse;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class SpellCheckCollatorWithSynonymTest extends SolrTestCaseJ4 {
+
+  // ./gradlew -p solr/core test --tests SpellCheckCollatorWithSynonymTest
+
+  // TODO o.a.s.s.DirectSolrSpellChecker does not seem to handle synonyms boost correctly: synonym
+  // input is `panthera pardus, leopard|0.6`
+  //
+  // 1. One scenario generates a single token `leopard|0.6`
+  // (value,frequency)
+  // title_en:leopard|0.6: 0
+  // title_en:leopard: 3
+  // Q. should these be the same frequency?
+  //
+  // 2. Another scenario a list `leopard`,`0`,`6`
+  // (value,frequency):
+  // spelltest_t:leopard: 3
+  // spelltest_t:0: 1
+  // spelltest_t:6: 1
+  // Q. should `0` and `6` be here? they introduce a lot of noise in the collation
+  //
+  // Generated tokens with
+  // TokenizerChain(org.apache.solr.analysis.MockTokenizerFactory@258bab26,
+  // org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory@3c8e9991,
+  // org.apache.lucene.analysis.core.StopFilterFactory@985a564,
+  // org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory@5fe66db9,
+  // org.apache.lucene.analysis.core.LowerCaseFilterFactory@274c3ff8,
+  // org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory@6fc67891,
+  // org.apache.lucene.analysis.en.PorterStemFilterFactory@4b4c5e88,
+  // org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory@272b79fd):
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-collapseqparser.xml", "schema11.xml");
+
+    assertU(adoc("id", "1", "spelltest_t", "Lorem panthera pardus ipsum 6 dolor sit amet"));
+    assertU(adoc("id", "2", "spelltest_t", "Lorem big cat pardus ipsum 0 dolor sit amet leopard"));
+    assertU(commit());
+  }
+
+  @Test
+  @SuppressWarnings({"unchecked", "rawtypes"})
+  public void testCollationWithHypens() {
+    SolrCore core = h.getCore();
+    SearchComponent speller = core.getSearchComponent("spellcheck");
+    assertNotNull("speller is null and it shouldn't be", speller);
+
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(SpellCheckComponent.COMPONENT_NAME, "true");
+    params.add(SpellingParams.SPELLCHECK_BUILD, "true");
+    params.add(SpellingParams.SPELLCHECK_COUNT, "10");
+    params.add(SpellingParams.SPELLCHECK_COLLATE, "true");
+    params.add(SpellingParams.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5");
+    params.add("df", "spelltest_t");
+    params.add(SpellCheckComponent.SPELLCHECK_DICT, "direct_spelltest_t");
+    params.add(SpellCheckComponent.SPELLCHECK_Q, "panthera pardus cats");
+
+    SolrRequestHandler handler = core.getRequestHandler("/spellCheckCompRH_Direct");
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    rsp.addResponseHeader(new SimpleOrderedMap());
+    SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
+    handler.handleRequest(req, rsp);
+    req.close();
+
+    NamedList values = rsp.getValues();
+    NamedList spellCheck = (NamedList) values.get("spellcheck");
+    NamedList collationHolder = (NamedList) spellCheck.get("collations");
+    List<String> collations = collationHolder.getAll("collation");
+
+    assertEquals(1, collations.size());
+    String collation = collations.iterator().next();
+    assertEquals("Incorrect collation: " + collation, "leopard cat", collation);
+  }
+}