Skip to content

Commit

Permalink
SOLR-13360 StringIndexOutOfBoundsException: String index out of range…
Browse files Browse the repository at this point in the history
…: -3
  • Loading branch information
stillalex committed Jan 24, 2025
1 parent b3b20d6 commit 7d45759
Show file tree
Hide file tree
Showing 4 changed files with 364 additions and 40 deletions.
146 changes: 106 additions & 40 deletions solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.CursorMarkParams;
Expand Down Expand Up @@ -212,61 +214,125 @@ public List<SpellCheckCollation> collate(
return collations;
}

private String getCollation(String origQuery, List<SpellCheckCorrection> corrections) {
static String getCollation(String origQuery, List<SpellCheckCorrection> correctionsAll) {
List<SpellCheckCorrection> corrections = mergeSpellCheckCorrections(correctionsAll);
StringBuilder collation = new StringBuilder(origQuery);
int offset = 0;
String corr = "";
for (int i = 0; i < corrections.size(); i++) {
SpellCheckCorrection correction = corrections.get(i);
for (SpellCheckCorrection correction : corrections) {
Token tok = correction.getOriginal();

// we are replacing the query in order, but injected terms might cause
// illegal offsets due to previous replacements.
if (tok.getPositionIncrement() == 0) continue;
corr = correction.getCorrection();
boolean addParenthesis = false;
Character requiredOrProhibited = null;
int indexOfSpace = corr.indexOf(' ');
StringBuilder corrSb = new StringBuilder(corr);
int bump = 1;

// If the correction contains whitespace (because it involved breaking a word in 2+ words),
// then be sure all of the new words have the same optional/required/prohibited status in the
// query.
while (indexOfSpace > -1 && indexOfSpace < corr.length() - 1) {
char previousChar = tok.startOffset() > 0 ? origQuery.charAt(tok.startOffset() - 1) : ' ';
if (previousChar == '-' || previousChar == '+') {
corrSb.insert(indexOfSpace + bump, previousChar);
if (requiredOrProhibited == null) {
requiredOrProhibited = previousChar;
}
bump++;
} else if ((tok.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG)
== QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) {
addParenthesis = true;
corrSb.insert(indexOfSpace + bump, "AND ");
bump += 4;
}
indexOfSpace = correction.getCorrection().indexOf(' ', indexOfSpace + bump);
}

int oneForReqOrProhib = 0;
if (addParenthesis) {
if (requiredOrProhibited != null) {
corrSb.insert(0, requiredOrProhibited);
oneForReqOrProhib++;
}
corrSb.insert(0, '(');
corrSb.append(')');
}
corr = corrSb.toString();
Entry<String, Integer> whitespaceCorrection =
whitespaceCorrection(origQuery, correction.getCorrection(), tok);
String corr = whitespaceCorrection.getKey();
int oneForReqOrProhib = whitespaceCorrection.getValue();

int startIndex = tok.startOffset() + offset - oneForReqOrProhib;

if (startIndex < 0) { // avoiding StringIndexOutOfBoundsException see SOLR-13360
logSIOOBEDebugData(origQuery, correctionsAll);
// is anything useful in the current collation or just give up?
// TODO return origQuery;
}

int endIndex = tok.endOffset() + offset;

collation.replace(startIndex, endIndex, corr);
offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
}
return collation.toString();
}

static Entry<String, Integer> whitespaceCorrection(
final String origQuery, final String correction, Token tok) {
String corr = correction;
boolean addParenthesis = false;
Character requiredOrProhibited = null;
int indexOfSpace = corr.indexOf(' ');
StringBuilder corrSb = new StringBuilder(corr);
int bump = 1;

// If the correction contains whitespace (because it involved breaking a word in 2+ words),
// then be sure all of the new words have the same optional/required/prohibited status in the
// query.
while (indexOfSpace > -1 && indexOfSpace < corr.length() - 1) {
char previousChar = tok.startOffset() > 0 ? origQuery.charAt(tok.startOffset() - 1) : ' ';
if (previousChar == '-' || previousChar == '+') {
corrSb.insert(indexOfSpace + bump, previousChar);
if (requiredOrProhibited == null) {
requiredOrProhibited = previousChar;
}
bump++;
} else if ((tok.getFlags() & QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG)
== QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG) {
addParenthesis = true;
corrSb.insert(indexOfSpace + bump, "AND ");
bump += 4;
}
indexOfSpace = correction.indexOf(' ', indexOfSpace + bump);
}

int oneForReqOrProhib = 0;
if (addParenthesis) {
if (requiredOrProhibited != null) {
corrSb.insert(0, requiredOrProhibited);
oneForReqOrProhib++;
}
corrSb.insert(0, '(');
corrSb.append(')');
}
corr = corrSb.toString();
return Map.entry(corr, oneForReqOrProhib);
}

private static volatile boolean mergeSpellCheckCorrections = true; // TODO system property

static List<SpellCheckCorrection> mergeSpellCheckCorrections(
List<SpellCheckCorrection> corrections) {
if (!mergeSpellCheckCorrections) {
return corrections;
}

// TODO fix overlapping tokan intervals
// - removing all entries with getPositionIncrement=0
// - sorting by token.startOffset
// - remove overlapping [startOffset, endOffset] intervals

return corrections;
}

private static volatile boolean logSIOOBEDebugData = true;

static void logSIOOBEDebugData(String origQuery, List<SpellCheckCorrection> corrections) {
if (!logSIOOBEDebugData) {
return;
}
logSIOOBEDebugData = false;
StringBuilder info = new StringBuilder(origQuery);
info.append("[");
for (SpellCheckCorrection correction : corrections) {
Token tok = correction.getOriginal();
info.append(
"('"
+ tok
+ "', "
+ tok.startOffset()
+ ", "
+ tok.endOffset()
+ ", "
+ tok.getPositionIncrement()
+ ", '"
+ correction.getOriginal()
+ "'),");
}
info.append("]");
info.append(", mergeSpellCheckCorrections=" + mergeSpellCheckCorrections);
log.warn("logging SIOOBE debug data, please report to SOLR-13360 {}", info);
}

public SpellCheckCollator setMaxCollations(int maxCollations) {
this.maxCollations = maxCollations;
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,13 @@
<str name="field">a_s</str>
<int name="minQueryLength">3</int>
</lst>

<lst name="spellchecker">
<str name="name">direct_spelltest_t</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="field">spelltest_t</str>
<int name="minQueryLength">3</int>
</lst>
</searchComponent>

<!--
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package org.apache.solr.spelling;

import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.Test;

public class SpellCheckCollatorCollationOnlyTest extends SolrTestCaseJ4 {

// Tests SOLR-13360 using some manual correction scenario based on various observations recorded
// in the ticket.
//
// Notes
// - tokens with (tok.getPositionIncrement() == 0) will be filtered out.
// - replacing longer tokens with shorter ones can make the offset go negative, which seems ok as
// long as the startOffset is always in (strict) increasing order

@Test
public void testCollationGeneration1() {
// sample from synonyms.txt:
// # Synonyms used in semantic expansion
// tabby => tabby, cat, feline, animal
// persian => persian, cat, feline, animal

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration1Shuffle() {
// same as testCollationGeneration1 but I am manually shuffling the tokens

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration1Repeat() {
// same as testCollationGeneration1 but I am manually repeating one of the tokens

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 1st token - duplicated
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration2() {
// sample from synonyms.txt:
// panthera pardus, leopard|0.6
//
// Note. depending on the field type, this can end up as the list of tokens: [leopard, 0, 6,
// panthera, pardu, cats]

String origQuery = "panthera pardus cats";

List<SpellCheckCorrection> corrections =
List.of(
correction("leopard", 0, 15, 1, "leopard"),
correction("0", 0, 15, 1, "0"),
correction("6", 0, 15, 1, "6"),
correction("panthera", 0, 8, 0, "panthera"),
correction("pardu", 9, 15, 1, "pardu"),
correction("cats", 16, 20, 1, "cat"));

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "pardu cat"; // TODO what is expected here?
assertEquals("Incorrect collation: " + collation, expected, collation);
}

private static SpellCheckCorrection correction(
String text, int start, int end, int positionIncrement, String correction) {
SpellCheckCorrection spellCheckCorrection = new SpellCheckCorrection();
spellCheckCorrection.setOriginal(token(text, start, end, positionIncrement));
spellCheckCorrection.setCorrection(correction);
spellCheckCorrection.setNumberOfOccurences(1);
return spellCheckCorrection;
}

private static Token token(String text, int start, int end, int positionIncrement) {
Token token = new Token(text, start, end);
token.setPositionIncrement(positionIncrement);
return token;
}
}
Loading

0 comments on commit 7d45759

Please sign in to comment.