Skip to content

Commit 42cd889

Browse files
ercsonusharma (Sonu Sharma), cpoerschke, dsmiley
authored
SOLR-17319 : New Combined Query / hybrid search (RRF) (#3418)
New CombinedQuerySearchHandler etc. for implementing hybrid search with reciprocal rank fusion (RRF). See "JSON Combined Query DSL" in ref guide, and params prefixed with "combiner". QueryComponent: refactorings to enable a subclass to customize merging shard results. --------- Co-authored-by: Sonu Sharma <sonu_sharma2@apple.com> Co-authored-by: Christine Poerschke <cpoerschke@apache.org> Co-authored-by: David Smiley <dsmiley@apache.org>
1 parent 11840af commit 42cd889

File tree

19 files changed

+2598
-52
lines changed

19 files changed

+2598
-52
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
title: New CombinedQuerySearchHandler etc. for implementing hybrid search with reciprocal rank fusion (RRF).
2+
type: added
3+
authors:
4+
- name: Sonu Sharma
5+
- name: David Smiley
6+
links:
7+
- name: SOLR-17319
8+
url: https://issues.apache.org/jira/browse/SOLR-17319

solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java

Lines changed: 614 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.handler.component;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import org.apache.solr.request.SolrQueryRequest;
22+
import org.apache.solr.response.SolrQueryResponse;
23+
24+
/**
25+
* The CombinedQueryResponseBuilder class extends the ResponseBuilder class and is responsible for
26+
* building a combined response for multiple SearchComponent objects. It orchestrates the process of
27+
* constructing the SolrQueryResponse by aggregating results from various components.
28+
*/
29+
class CombinedQueryResponseBuilder extends ResponseBuilder {
30+
31+
final List<ResponseBuilder> responseBuilders = new ArrayList<>();
32+
33+
CombinedQueryResponseBuilder(
34+
SolrQueryRequest req, SolrQueryResponse rsp, List<SearchComponent> components) {
35+
super(req, rsp, components);
36+
}
37+
38+
/**
39+
* Propagates all the properties from parent ResponseBuilder to the all the children which are
40+
* being set later after the CombinedQueryComponent is prepared.
41+
*/
42+
final void propagate() {
43+
responseBuilders.forEach(
44+
thisRb -> {
45+
thisRb.setNeedDocSet(isNeedDocSet());
46+
thisRb.setNeedDocList(isNeedDocList());
47+
thisRb.doFacets = doFacets;
48+
thisRb.doHighlights = doHighlights;
49+
thisRb.doExpand = doExpand;
50+
thisRb.doTerms = doTerms;
51+
thisRb.doStats = doStats;
52+
thisRb.setDistribStatsDisabled(isDistribStatsDisabled());
53+
});
54+
}
55+
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.handler.component;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import org.apache.solr.common.params.CombinerParams;
22+
import org.apache.solr.request.SolrQueryRequest;
23+
import org.apache.solr.response.SolrQueryResponse;
24+
25+
/**
26+
* Extends the SearchHandler combining/fusing multiple queries (e.g. RRF) when the {@link
27+
* CombinerParams#COMBINER} param is provided. If it isn't, does nothing special over SearchHandler.
28+
*
29+
* @see CombinedQueryComponent
30+
*/
31+
public class CombinedQuerySearchHandler extends SearchHandler {
32+
33+
/** Overrides to potentially return a custom {@link CombinedQueryResponseBuilder}. */
34+
@Override
35+
protected ResponseBuilder newResponseBuilder(
36+
SolrQueryRequest req, SolrQueryResponse rsp, List<SearchComponent> components) {
37+
if (req.getParams().getBool(CombinerParams.COMBINER, false)) {
38+
var rb = new CombinedQueryResponseBuilder(req, rsp, components);
39+
// CombinedQueryComponent is only designed to work with distributed search.
40+
rb.setForcedDistrib(true);
41+
return rb;
42+
}
43+
return super.newResponseBuilder(req, rsp, components);
44+
}
45+
46+
@Override
47+
protected void postPrepareComponents(ResponseBuilder rb) {
48+
super.postPrepareComponents(rb);
49+
// propagate the CombinedQueryResponseBuilder's state to all subBuilders after prepare
50+
if (rb instanceof CombinedQueryResponseBuilder crb) {
51+
crb.propagate();
52+
}
53+
}
54+
55+
/** Overrides the default list to include {@link CombinedQueryComponent}. */
56+
@Override
57+
protected List<String> getDefaultComponents() {
58+
List<String> names = new ArrayList<>(super.getDefaultComponents());
59+
String replaced = names.set(0, CombinedQueryComponent.COMPONENT_NAME);
60+
assert replaced.equals(QueryComponent.COMPONENT_NAME);
61+
return names;
62+
}
63+
}

solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java

Lines changed: 92 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,67 @@ protected boolean addFL(StringBuilder fl, String field, boolean additionalAdded)
907907
return true;
908908
}
909909

910+
protected abstract static class ShardDocQueue {
911+
public abstract boolean push(ShardDoc shardDoc);
912+
913+
public abstract Map<Object, ShardDoc> resultIds(int offset);
914+
}
915+
;
916+
917+
protected ShardDocQueue newShardDocQueue(
918+
SolrIndexSearcher searcher, SortField[] sortFields, Integer size) {
919+
return new ShardDocQueue() {
920+
921+
// id to shard mapping, to eliminate any accidental dups
922+
private final HashMap<Object, String> uniqueDoc = new HashMap<>();
923+
924+
private final ShardFieldSortedHitQueue queue =
925+
new ShardFieldSortedHitQueue(sortFields, size, searcher);
926+
927+
@Override
928+
public boolean push(ShardDoc shardDoc) {
929+
final String prevShard = uniqueDoc.put(shardDoc.id, shardDoc.shard);
930+
if (prevShard != null) {
931+
// duplicate detected
932+
933+
// For now, just always use the first encountered since we can't currently
934+
// remove the previous one added to the priority queue. If we switched
935+
// to the Java5 PriorityQueue, this would be easier.
936+
return false;
937+
// make which duplicate is used deterministic based on shard
938+
// if (prevShard.compareTo(shardDoc.shard) >= 0) {
939+
// TODO: remove previous from priority queue
940+
// return false;
941+
// }
942+
}
943+
944+
queue.insertWithOverflow(shardDoc);
945+
return true;
946+
}
947+
948+
@Override
949+
public Map<Object, ShardDoc> resultIds(int offset) {
950+
final Map<Object, ShardDoc> resultIds = new HashMap<>();
951+
952+
// The queue now has 0 -> queuesize docs, where queuesize <= start + rows
953+
// So we want to pop the last documents off the queue to get
954+
// the docs offset -> queuesize
955+
int resultSize = queue.size() - offset;
956+
resultSize = Math.max(0, resultSize); // there may not be any docs in range
957+
958+
for (int i = resultSize - 1; i >= 0; i--) {
959+
ShardDoc shardDoc = queue.pop();
960+
shardDoc.positionInResponse = i;
961+
// Need the toString() for correlation with other lists that must
962+
// be strings (like keys in highlighting, explain, etc)
963+
resultIds.put(shardDoc.id.toString(), shardDoc);
964+
}
965+
966+
return resultIds;
967+
}
968+
};
969+
}
970+
910971
protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
911972
List<MergeStrategy> mergeStrategies = rb.getMergeStrategies();
912973
if (mergeStrategies != null) {
@@ -949,14 +1010,10 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
9491010
IndexSchema schema = rb.req.getSchema();
9501011
SchemaField uniqueKeyField = schema.getUniqueKeyField();
9511012

952-
// id to shard mapping, to eliminate any accidental dups
953-
HashMap<Object, String> uniqueDoc = new HashMap<>();
954-
9551013
// Merge the docs via a priority queue so we don't have to sort *all* of the
9561014
// documents... we only need to order the top (rows+start)
957-
final ShardFieldSortedHitQueue queue =
958-
new ShardFieldSortedHitQueue(
959-
sortFields, ss.getOffset() + ss.getCount(), rb.req.getSearcher());
1015+
final ShardDocQueue shardDocQueue =
1016+
newShardDocQueue(rb.req.getSearcher(), sortFields, ss.getOffset() + ss.getCount());
9601017

9611018
NamedList<Object> shardInfo = null;
9621019
if (rb.req.getParams().getBool(ShardParams.SHARDS_INFO, false)) {
@@ -1127,23 +1184,6 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
11271184
for (int i = 0; i < docs.size(); i++) {
11281185
SolrDocument doc = docs.get(i);
11291186
Object id = doc.getFieldValue(uniqueKeyField.getName());
1130-
1131-
String prevShard = uniqueDoc.put(id, srsp.getShard());
1132-
if (prevShard != null) {
1133-
// duplicate detected
1134-
numFound--;
1135-
1136-
// For now, just always use the first encountered since we can't currently
1137-
// remove the previous one added to the priority queue. If we switched
1138-
// to the Java5 PriorityQueue, this would be easier.
1139-
continue;
1140-
// make which duplicate is used deterministic based on shard
1141-
// if (prevShard.compareTo(srsp.shard) >= 0) {
1142-
// TODO: remove previous from priority queue
1143-
// continue;
1144-
// }
1145-
}
1146-
11471187
ShardDoc shardDoc = new ShardDoc();
11481188
shardDoc.id = id;
11491189
shardDoc.shard = srsp.getShard();
@@ -1162,42 +1202,18 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
11621202

11631203
shardDoc.sortFieldValues = unmarshalledSortFieldValues;
11641204

1165-
queue.insertWithOverflow(shardDoc);
1205+
if (!shardDocQueue.push(shardDoc)) {
1206+
numFound--;
1207+
}
11661208
} // end for-each-doc-in-response
11671209
} // end for-each-response
11681210

1169-
// The queue now has 0 -> queuesize docs, where queuesize <= start + rows
1170-
// So we want to pop the last documents off the queue to get
1171-
// the docs offset -> queuesize
1172-
int resultSize = queue.size() - ss.getOffset();
1173-
resultSize = Math.max(0, resultSize); // there may not be any docs in range
1174-
1175-
Map<Object, ShardDoc> resultIds = new HashMap<>();
1176-
for (int i = resultSize - 1; i >= 0; i--) {
1177-
ShardDoc shardDoc = queue.pop();
1178-
shardDoc.positionInResponse = i;
1179-
// Need the toString() for correlation with other lists that must
1180-
// be strings (like keys in highlighting, explain, etc)
1181-
resultIds.put(shardDoc.id.toString(), shardDoc);
1182-
}
1183-
11841211
// Add hits for distributed requests
11851212
// https://issues.apache.org/jira/browse/SOLR-3518
11861213
rb.rsp.addToLog("hits", numFound);
11871214

1188-
SolrDocumentList responseDocs = new SolrDocumentList();
1189-
if (maxScore != null) responseDocs.setMaxScore(maxScore);
1190-
responseDocs.setNumFound(numFound);
1191-
responseDocs.setNumFoundExact(hitCountIsExact);
1192-
responseDocs.setStart(ss.getOffset());
1193-
// size appropriately
1194-
for (int i = 0; i < resultSize; i++) responseDocs.add(null);
1195-
1196-
// save these results in a private area so we can access them
1197-
// again when retrieving stored fields.
1198-
// TODO: use ResponseBuilder (w/ comments) or the request context?
1199-
rb.resultIds = resultIds;
1200-
rb.setResponseDocs(responseDocs);
1215+
setResultIdsAndResponseDocs(
1216+
rb, shardDocQueue, maxScore, numFound, hitCountIsExact, ss.getOffset());
12011217

12021218
populateNextCursorMarkFromMergedShards(rb);
12031219

@@ -1243,6 +1259,30 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
12431259
}
12441260
}
12451261

1262+
protected void setResultIdsAndResponseDocs(
1263+
ResponseBuilder rb,
1264+
ShardDocQueue shardDocQueue,
1265+
Float maxScore,
1266+
long numFound,
1267+
boolean hitCountIsExact,
1268+
int offset) {
1269+
final Map<Object, ShardDoc> resultIds = shardDocQueue.resultIds(offset);
1270+
1271+
final SolrDocumentList responseDocs = new SolrDocumentList();
1272+
if (maxScore != null) responseDocs.setMaxScore(maxScore);
1273+
responseDocs.setNumFound(numFound);
1274+
responseDocs.setNumFoundExact(hitCountIsExact);
1275+
responseDocs.setStart(offset);
1276+
// size appropriately
1277+
for (int i = 0; i < resultIds.size(); i++) responseDocs.add(null);
1278+
1279+
// save these results in a private area so we can access them
1280+
// again when retrieving stored fields.
1281+
// TODO: use ResponseBuilder (w/ comments) or the request context?
1282+
rb.resultIds = resultIds;
1283+
rb.setResponseDocs(responseDocs);
1284+
}
1285+
12461286
/**
12471287
* Inspects the state of the {@link ResponseBuilder} and populates the next {@link
12481288
* ResponseBuilder#setNextCursorMark} as appropriate based on the merged sort values from

0 commit comments

Comments
 (0)