Skip to content

Commit 400cbb5

Browse files
ercsonusharmaSonu Sharmacpoerschkedsmiley
committed
SOLR-17319 : New Combined Query / hybrid search (RRF) (#3418)
New CombinedQuerySearchHandler etc. for implementing hybrid search with reciprocal rank fusion (RRF). See "JSON Combined Query DSL" in ref guide, and params prefixed with "combiner". QueryComponent: refactorings to enable a subclass to customize merging shard results. --------- Co-authored-by: Sonu Sharma <sonu_sharma2@apple.com> Co-authored-by: Christine Poerschke <cpoerschke@apache.org> Co-authored-by: David Smiley <dsmiley@apache.org> (cherry picked from commit 42cd889)
1 parent 9aab8b6 commit 400cbb5

File tree

19 files changed

+2608
-52
lines changed

19 files changed

+2608
-52
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
title: New CombinedQuerySearchHandler etc. for implementing hybrid search with reciprocal rank fusion (RRF).
2+
type: added
3+
authors:
4+
- name: Sonu Sharma
5+
- name: David Smiley
6+
links:
7+
- name: SOLR-17319
8+
url: https://issues.apache.org/jira/browse/SOLR-17319

solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java

Lines changed: 622 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.handler.component;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import org.apache.solr.request.SolrQueryRequest;
22+
import org.apache.solr.response.SolrQueryResponse;
23+
24+
/**
25+
* The CombinedQueryResponseBuilder class extends the ResponseBuilder class and is responsible for
26+
* building a combined response for multiple SearchComponent objects. It orchestrates the process of
27+
* constructing the SolrQueryResponse by aggregating results from various components.
28+
*/
29+
class CombinedQueryResponseBuilder extends ResponseBuilder {
30+
31+
final List<ResponseBuilder> responseBuilders = new ArrayList<>();
32+
33+
CombinedQueryResponseBuilder(
34+
SolrQueryRequest req, SolrQueryResponse rsp, List<SearchComponent> components) {
35+
super(req, rsp, components);
36+
}
37+
38+
/**
39+
* Propagates all the properties from parent ResponseBuilder to the all the children which are
40+
* being set later after the CombinedQueryComponent is prepared.
41+
*/
42+
final void propagate() {
43+
responseBuilders.forEach(
44+
thisRb -> {
45+
thisRb.setNeedDocSet(isNeedDocSet());
46+
thisRb.setNeedDocList(isNeedDocList());
47+
thisRb.doFacets = doFacets;
48+
thisRb.doHighlights = doHighlights;
49+
thisRb.doExpand = doExpand;
50+
thisRb.doTerms = doTerms;
51+
thisRb.doStats = doStats;
52+
thisRb.setDistribStatsDisabled(isDistribStatsDisabled());
53+
});
54+
}
55+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.handler.component;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import org.apache.solr.common.params.CombinerParams;
22+
import org.apache.solr.request.SolrQueryRequest;
23+
import org.apache.solr.response.SolrQueryResponse;
24+
25+
/**
26+
* Extends the SearchHandler combining/fusing multiple queries (e.g. RRF) when the {@link
27+
* CombinerParams#COMBINER} param is provided. If it isn't, does nothing special over SearchHandler.
28+
*
29+
* @see CombinedQueryComponent
30+
*/
31+
public class CombinedQuerySearchHandler extends SearchHandler {
32+
33+
/** Overrides to potentially return a custom {@link CombinedQueryResponseBuilder}. */
34+
@Override
35+
protected ResponseBuilder newResponseBuilder(
36+
SolrQueryRequest req, SolrQueryResponse rsp, List<SearchComponent> components) {
37+
if (req.getParams().getBool(CombinerParams.COMBINER, false)) {
38+
var rb = new CombinedQueryResponseBuilder(req, rsp, components);
39+
// CombinedQueryComponent is only designed to work with distributed search.
40+
rb.setForcedDistrib(true);
41+
return rb;
42+
}
43+
return super.newResponseBuilder(req, rsp, components);
44+
}
45+
46+
@Override
47+
protected void postPrepareComponents(ResponseBuilder rb) {
48+
super.postPrepareComponents(rb);
49+
// propagate the CombinedQueryResponseBuilder's state to all subBuilders after prepare
50+
if (rb instanceof CombinedQueryResponseBuilder) {
51+
var crb = (CombinedQueryResponseBuilder) rb;
52+
crb.propagate();
53+
}
54+
}
55+
56+
/** Overrides the default list to include {@link CombinedQueryComponent}. */
57+
@Override
58+
protected List<String> getDefaultComponents() {
59+
List<String> names = new ArrayList<>(super.getDefaultComponents());
60+
String replaced = names.set(0, CombinedQueryComponent.COMPONENT_NAME);
61+
assert replaced.equals(QueryComponent.COMPONENT_NAME);
62+
return names;
63+
}
64+
}

solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java

Lines changed: 92 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,67 @@ protected boolean addFL(StringBuilder fl, String field, boolean additionalAdded)
902902
return true;
903903
}
904904

905+
protected abstract static class ShardDocQueue {
906+
public abstract boolean push(ShardDoc shardDoc);
907+
908+
public abstract Map<Object, ShardDoc> resultIds(int offset);
909+
}
910+
;
911+
912+
protected ShardDocQueue newShardDocQueue(
913+
SolrIndexSearcher searcher, SortField[] sortFields, Integer size) {
914+
return new ShardDocQueue() {
915+
916+
// id to shard mapping, to eliminate any accidental dups
917+
private final HashMap<Object, String> uniqueDoc = new HashMap<>();
918+
919+
private final ShardFieldSortedHitQueue queue =
920+
new ShardFieldSortedHitQueue(sortFields, size, searcher);
921+
922+
@Override
923+
public boolean push(ShardDoc shardDoc) {
924+
final String prevShard = uniqueDoc.put(shardDoc.id, shardDoc.shard);
925+
if (prevShard != null) {
926+
// duplicate detected
927+
928+
// For now, just always use the first encountered since we can't currently
929+
// remove the previous one added to the priority queue. If we switched
930+
// to the Java5 PriorityQueue, this would be easier.
931+
return false;
932+
// make which duplicate is used deterministic based on shard
933+
// if (prevShard.compareTo(shardDoc.shard) >= 0) {
934+
// TODO: remove previous from priority queue
935+
// return false;
936+
// }
937+
}
938+
939+
queue.insertWithOverflow(shardDoc);
940+
return true;
941+
}
942+
943+
@Override
944+
public Map<Object, ShardDoc> resultIds(int offset) {
945+
final Map<Object, ShardDoc> resultIds = new HashMap<>();
946+
947+
// The queue now has 0 -> queuesize docs, where queuesize <= start + rows
948+
// So we want to pop the last documents off the queue to get
949+
// the docs offset -> queuesize
950+
int resultSize = queue.size() - offset;
951+
resultSize = Math.max(0, resultSize); // there may not be any docs in range
952+
953+
for (int i = resultSize - 1; i >= 0; i--) {
954+
ShardDoc shardDoc = queue.pop();
955+
shardDoc.positionInResponse = i;
956+
// Need the toString() for correlation with other lists that must
957+
// be strings (like keys in highlighting, explain, etc)
958+
resultIds.put(shardDoc.id.toString(), shardDoc);
959+
}
960+
961+
return resultIds;
962+
}
963+
};
964+
}
965+
905966
protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
906967
List<MergeStrategy> mergeStrategies = rb.getMergeStrategies();
907968
if (mergeStrategies != null) {
@@ -944,14 +1005,10 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
9441005
IndexSchema schema = rb.req.getSchema();
9451006
SchemaField uniqueKeyField = schema.getUniqueKeyField();
9461007

947-
// id to shard mapping, to eliminate any accidental dups
948-
HashMap<Object, String> uniqueDoc = new HashMap<>();
949-
9501008
// Merge the docs via a priority queue so we don't have to sort *all* of the
9511009
// documents... we only need to order the top (rows+start)
952-
final ShardFieldSortedHitQueue queue =
953-
new ShardFieldSortedHitQueue(
954-
sortFields, ss.getOffset() + ss.getCount(), rb.req.getSearcher());
1010+
final ShardDocQueue shardDocQueue =
1011+
newShardDocQueue(rb.req.getSearcher(), sortFields, ss.getOffset() + ss.getCount());
9551012

9561013
NamedList<Object> shardInfo = null;
9571014
if (rb.req.getParams().getBool(ShardParams.SHARDS_INFO, false)) {
@@ -1122,23 +1179,6 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
11221179
for (int i = 0; i < docs.size(); i++) {
11231180
SolrDocument doc = docs.get(i);
11241181
Object id = doc.getFieldValue(uniqueKeyField.getName());
1125-
1126-
String prevShard = uniqueDoc.put(id, srsp.getShard());
1127-
if (prevShard != null) {
1128-
// duplicate detected
1129-
numFound--;
1130-
1131-
// For now, just always use the first encountered since we can't currently
1132-
// remove the previous one added to the priority queue. If we switched
1133-
// to the Java5 PriorityQueue, this would be easier.
1134-
continue;
1135-
// make which duplicate is used deterministic based on shard
1136-
// if (prevShard.compareTo(srsp.shard) >= 0) {
1137-
// TODO: remove previous from priority queue
1138-
// continue;
1139-
// }
1140-
}
1141-
11421182
ShardDoc shardDoc = new ShardDoc();
11431183
shardDoc.id = id;
11441184
shardDoc.shard = srsp.getShard();
@@ -1157,42 +1197,18 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
11571197

11581198
shardDoc.sortFieldValues = unmarshalledSortFieldValues;
11591199

1160-
queue.insertWithOverflow(shardDoc);
1200+
if (!shardDocQueue.push(shardDoc)) {
1201+
numFound--;
1202+
}
11611203
} // end for-each-doc-in-response
11621204
} // end for-each-response
11631205

1164-
// The queue now has 0 -> queuesize docs, where queuesize <= start + rows
1165-
// So we want to pop the last documents off the queue to get
1166-
// the docs offset -> queuesize
1167-
int resultSize = queue.size() - ss.getOffset();
1168-
resultSize = Math.max(0, resultSize); // there may not be any docs in range
1169-
1170-
Map<Object, ShardDoc> resultIds = new HashMap<>();
1171-
for (int i = resultSize - 1; i >= 0; i--) {
1172-
ShardDoc shardDoc = queue.pop();
1173-
shardDoc.positionInResponse = i;
1174-
// Need the toString() for correlation with other lists that must
1175-
// be strings (like keys in highlighting, explain, etc)
1176-
resultIds.put(shardDoc.id.toString(), shardDoc);
1177-
}
1178-
11791206
// Add hits for distributed requests
11801207
// https://issues.apache.org/jira/browse/SOLR-3518
11811208
rb.rsp.addToLog("hits", numFound);
11821209

1183-
SolrDocumentList responseDocs = new SolrDocumentList();
1184-
if (maxScore != null) responseDocs.setMaxScore(maxScore);
1185-
responseDocs.setNumFound(numFound);
1186-
responseDocs.setNumFoundExact(hitCountIsExact);
1187-
responseDocs.setStart(ss.getOffset());
1188-
// size appropriately
1189-
for (int i = 0; i < resultSize; i++) responseDocs.add(null);
1190-
1191-
// save these results in a private area so we can access them
1192-
// again when retrieving stored fields.
1193-
// TODO: use ResponseBuilder (w/ comments) or the request context?
1194-
rb.resultIds = resultIds;
1195-
rb.setResponseDocs(responseDocs);
1210+
setResultIdsAndResponseDocs(
1211+
rb, shardDocQueue, maxScore, numFound, hitCountIsExact, ss.getOffset());
11961212

11971213
populateNextCursorMarkFromMergedShards(rb);
11981214

@@ -1238,6 +1254,30 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
12381254
}
12391255
}
12401256

1257+
protected void setResultIdsAndResponseDocs(
1258+
ResponseBuilder rb,
1259+
ShardDocQueue shardDocQueue,
1260+
Float maxScore,
1261+
long numFound,
1262+
boolean hitCountIsExact,
1263+
int offset) {
1264+
final Map<Object, ShardDoc> resultIds = shardDocQueue.resultIds(offset);
1265+
1266+
final SolrDocumentList responseDocs = new SolrDocumentList();
1267+
if (maxScore != null) responseDocs.setMaxScore(maxScore);
1268+
responseDocs.setNumFound(numFound);
1269+
responseDocs.setNumFoundExact(hitCountIsExact);
1270+
responseDocs.setStart(offset);
1271+
// size appropriately
1272+
for (int i = 0; i < resultIds.size(); i++) responseDocs.add(null);
1273+
1274+
// save these results in a private area so we can access them
1275+
// again when retrieving stored fields.
1276+
// TODO: use ResponseBuilder (w/ comments) or the request context?
1277+
rb.resultIds = resultIds;
1278+
rb.setResponseDocs(responseDocs);
1279+
}
1280+
12411281
/**
12421282
* Inspects the state of the {@link ResponseBuilder} and populates the next {@link
12431283
* ResponseBuilder#setNextCursorMark} as appropriate based on the merged sort values from

0 commit comments

Comments
 (0)