Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOLR-13360 StringIndexOutOfBoundsException: String index out of range #3112

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ public String toString() {
}
}

private static class StartOffsetComparator implements Comparator<SpellCheckCorrection> {
static class StartOffsetComparator implements Comparator<SpellCheckCorrection> {
@Override
public int compare(SpellCheckCorrection o1, SpellCheckCorrection o2) {
return o1.getOriginal().startOffset() - o2.getOriginal().startOffset();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ public class SpellCheckCollator {
private boolean suggestionsMayOverlap = false;
private int docCollectionLimit = 0;

private static volatile boolean mergeSpellCheckCorrections =
Boolean.parseBoolean(
System.getProperty("solr.SpellCheckCollator.mergeSpellCheckCorrections", "true"));

private static volatile boolean logSIOOBEDebugData =
Boolean.parseBoolean(
System.getProperty("solr.SpellCheckCollator.logSIOOBEDebugData", "true"));

public List<SpellCheckCollation> collate(
SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse) {
List<SpellCheckCollation> collations = new ArrayList<>();
Expand Down Expand Up @@ -212,7 +220,8 @@ public List<SpellCheckCollation> collate(
return collations;
}

private String getCollation(String origQuery, List<SpellCheckCorrection> corrections) {
static String getCollation(String origQuery, List<SpellCheckCorrection> correctionsAll) {
List<SpellCheckCorrection> corrections = mergeSpellCheckCorrections(correctionsAll);
StringBuilder collation = new StringBuilder(origQuery);
int offset = 0;
String corr = "";
Expand Down Expand Up @@ -260,13 +269,74 @@ private String getCollation(String origQuery, List<SpellCheckCorrection> correct
}
corr = corrSb.toString();
int startIndex = tok.startOffset() + offset - oneForReqOrProhib;

if (startIndex < 0) { // avoiding StringIndexOutOfBoundsException see SOLR-13360
logSIOOBEDebugData(origQuery, correctionsAll);
break;
}

int endIndex = tok.endOffset() + offset;
collation.replace(startIndex, endIndex, corr);
offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
}
return collation.toString();
}

static List<SpellCheckCorrection> mergeSpellCheckCorrections(
List<SpellCheckCorrection> corrections) {
if (!mergeSpellCheckCorrections) {
return corrections;
}

// Note. fix overlapping token intervals
// - sorting by token.startOffset. unclear if this can cause some reshuffle where lower rank
// items with the same position can come first
// - remove overlapping [startOffset, endOffset] intervals

List<SpellCheckCorrection> filtered = new ArrayList<SpellCheckCorrection>(corrections);
filtered.sort(new PossibilityIterator.StartOffsetComparator());

int end = -1;
for (Iterator<SpellCheckCorrection> iterator = filtered.iterator(); iterator.hasNext(); ) {
SpellCheckCorrection correction = iterator.next();
Token t = correction.getOriginal();
if (t.startOffset() > end && t.endOffset() > end) {
end = t.endOffset();
} else {
iterator.remove();
}
}

return filtered;
}

static void logSIOOBEDebugData(String origQuery, List<SpellCheckCorrection> corrections) {
if (!logSIOOBEDebugData) {
return;
}
logSIOOBEDebugData = false;
StringBuilder info = new StringBuilder(origQuery);
info.append("[");
for (SpellCheckCorrection correction : corrections) {
Token tok = correction.getOriginal();
info.append(
"('"
+ tok
+ "', "
+ tok.startOffset()
+ ", "
+ tok.endOffset()
+ ", "
+ tok.getPositionIncrement()
+ ", '"
+ correction.getOriginal()
+ "'),");
}
info.append("]");
info.append(", mergeSpellCheckCorrections=" + mergeSpellCheckCorrections);
log.warn("logging SIOOBE debug data, please report to SOLR-13360 {}", info);
}

public SpellCheckCollator setMaxCollations(int maxCollations) {
this.maxCollations = maxCollations;
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,13 @@
<str name="field">a_s</str>
<int name="minQueryLength">3</int>
</lst>

<lst name="spellchecker">
<str name="name">direct_spelltest_t</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="field">spelltest_t</str>
<int name="minQueryLength">3</int>
</lst>
</searchComponent>

<!--
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;

import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.Test;

public class SpellCheckCollatorCollationOnlyTest extends SolrTestCaseJ4 {

// Tests SOLR-13360 using some manual correction scenario based on various observations recorded
// in the ticket.
//
// Notes
// - tokens with (tok.getPositionIncrement() == 0) will be filtered out.
// - replacing longer tokens with shorter ones can make the offset go negative, which seems ok as
// long as the startOffset is always in (strict) increasing order

@Test
public void testCollationGeneration1() {
// sample from synonyms.txt:
// # Synonyms used in semantic expansion
// tabby => tabby, cat, feline, animal
// persian => persian, cat, feline, animal

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration1Shuffle() {
// same as testCollationGeneration1 but I am manually shuffling the tokens

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration1Repeat() {
// same as testCollationGeneration1 but I am manually repeating one of the tokens

String origQuery = "cats persians tabby";
List<SpellCheckCorrection> corrections =
List.of(
// ref 1st token
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 1st token - duplicated
correction("cats", 0, 4, 1, "cat"), // original token. offset goes to -1

// ref 2nd token
correction("persian", 5, 13, 1, "persian"), // original token. offset goes to -2

// ref 3rd token
correction("tabbi", 14, 19, 1, "tabbi"), // original token
correction("cat", 14, 19, 0, "cat"), // synonym
correction("felin", 14, 19, 0, "felin"), // synonym
correction("anim", 14, 19, 0, "anim") // synonym
);

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "cat persian tabbi";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

@Test
public void testCollationGeneration2() {
// sample from synonyms.txt:
// panthera pardus, leopard|0.6
//
// Note. depending on the field type, this can end up as the list of tokens: [leopard, 0, 6,
// panthera, pardu, cats]

String origQuery = "panthera pardus cats";

List<SpellCheckCorrection> corrections =
List.of(
correction("leopard", 0, 15, 1, "leopard"),
correction("0", 0, 15, 1, "0"),
correction("6", 0, 15, 1, "6"),
correction("panthera", 0, 8, 0, "panthera"),
correction("pardu", 9, 15, 1, "pardu"),
correction("cats", 16, 20, 1, "cat"));

String collation = SpellCheckCollator.getCollation(origQuery, corrections);
String expected = "leopard cat";
assertEquals("Incorrect collation: " + collation, expected, collation);
}

private static SpellCheckCorrection correction(
String text, int start, int end, int positionIncrement, String correction) {
SpellCheckCorrection spellCheckCorrection = new SpellCheckCorrection();
spellCheckCorrection.setOriginal(token(text, start, end, positionIncrement));
spellCheckCorrection.setCorrection(correction);
spellCheckCorrection.setNumberOfOccurences(1);
return spellCheckCorrection;
}

private static Token token(String text, int start, int end, int positionIncrement) {
Token token = new Token(text, start, end);
token.setPositionIncrement(positionIncrement);
return token;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;

import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.SpellCheckComponent;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.BeforeClass;
import org.junit.Test;

public class SpellCheckCollatorWithSynonymTest extends SolrTestCaseJ4 {

// ./gradlew -p solr/core test --tests SpellCheckCollatorWithSynonymTest

// TODO o.a.s.s.DirectSolrSpellChecker does not seem to handle synonyms boost correctly: synonym
// input is `panthera pardus, leopard|0.6`
//
// 1. One scenario generates a single token `leopard|0.6`
// (value,frequency)
// title_en:leopard|0.6: 0
// title_en:leopard: 3
// Q. should these be the same frequency?
//
// 2. Another scenario a list `leopard`,`0`,`6`
// (value,frequency):
// spelltest_t:leopard: 3
// spelltest_t:0: 1
// spelltest_t:6: 1
// Q. should `0` and `6` be here? they introduce a lot of noise in the collation
//
// Generated tokens with
// TokenizerChain(org.apache.solr.analysis.MockTokenizerFactory@258bab26,
// org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory@3c8e9991,
// org.apache.lucene.analysis.core.StopFilterFactory@985a564,
// org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory@5fe66db9,
// org.apache.lucene.analysis.core.LowerCaseFilterFactory@274c3ff8,
// org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory@6fc67891,
// org.apache.lucene.analysis.en.PorterStemFilterFactory@4b4c5e88,
// org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory@272b79fd):

@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-collapseqparser.xml", "schema11.xml");

assertU(adoc("id", "1", "spelltest_t", "Lorem panthera pardus ipsum 6 dolor sit amet"));
assertU(adoc("id", "2", "spelltest_t", "Lorem big cat pardus ipsum 0 dolor sit amet leopard"));
assertU(commit());
}

@Test
@SuppressWarnings({"unchecked", "rawtypes"})
public void testCollationWithHypens() {
SolrCore core = h.getCore();
SearchComponent speller = core.getSearchComponent("spellcheck");
assertNotNull("speller is null and it shouldn't be", speller);

ModifiableSolrParams params = new ModifiableSolrParams();
params.add(SpellCheckComponent.COMPONENT_NAME, "true");
params.add(SpellingParams.SPELLCHECK_BUILD, "true");
params.add(SpellingParams.SPELLCHECK_COUNT, "10");
params.add(SpellingParams.SPELLCHECK_COLLATE, "true");
params.add(SpellingParams.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5");
params.add("df", "spelltest_t");
params.add(SpellCheckComponent.SPELLCHECK_DICT, "direct_spelltest_t");
params.add(SpellCheckComponent.SPELLCHECK_Q, "panthera pardus cats");

SolrRequestHandler handler = core.getRequestHandler("/spellCheckCompRH_Direct");
SolrQueryResponse rsp = new SolrQueryResponse();
rsp.addResponseHeader(new SimpleOrderedMap());
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp);
req.close();

NamedList values = rsp.getValues();
NamedList spellCheck = (NamedList) values.get("spellcheck");
NamedList collationHolder = (NamedList) spellCheck.get("collations");
List<String> collations = collationHolder.getAll("collation");

assertEquals(1, collations.size());
String collation = collations.iterator().next();
assertEquals("Incorrect collation: " + collation, "leopard cat", collation);
}
}
Loading