Skip to content

Commit 6c4d427

Browse files
authored
Add NestedBestChildVectorScorer and KnnBinaryDocValuesScorer for exact search (#3179)
* Add NestedBestChildVectorScorer and KnnBinaryDocValuesScorer for exact search Introduce VectorScorer implementations for cases where Lucene's built-in vector values do not provide a scorer: - NestedBestChildVectorScorer: Groups child documents by parent and returns the best-scoring child per parent. Adapted from Lucene's DiversifyingChildrenVectorScorer to implement VectorScorer. Supports both filtered and unfiltered iteration. - KnnBinaryDocValuesScorer: Scores documents backed by BinaryDocValues by deserializing stored vectors and comparing against the query vector using SpaceType. Supports both float[] and byte[] query vectors via overloaded factory methods. Signed-off-by: Vijayan Balasubramanian <balasvij@amazon.com>
1 parent d7b5ecb commit 6c4d427

File tree

5 files changed

+752
-0
lines changed

5 files changed

+752
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3131
* Speedup FP16 bulk similarity by precomputing the tail mask [#3172](https://github.com/opensearch-project/k-NN/pull/3172)
3232
* Add Prefetch functionality to prefetch vectors during ANN Search for MemoryOptimizedSearch. [#3173](https://github.com/opensearch-project/k-NN/pull/3173)
3333
* Optimize ByteVectorIdsExactKNNIterator by moving array conversion to constructor [#3171](https://github.com/opensearch-project/k-NN/pull/3171)
34+
* Add VectorScorers for BinaryDocValues and nested best child scoring [#3179](https://github.com/opensearch-project/k-NN/pull/3179)
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.knn.index.query.scorers;
7+
8+
import org.apache.lucene.index.BinaryDocValues;
9+
import org.apache.lucene.search.DocIdSetIterator;
10+
import org.apache.lucene.search.VectorScorer;
11+
import org.apache.lucene.util.ArrayUtil;
12+
import org.apache.lucene.util.BytesRef;
13+
import org.opensearch.knn.index.SpaceType;
14+
import org.opensearch.knn.index.codec.util.KNNVectorAsCollectionOfFloatsSerializer;
15+
16+
import java.io.IOException;
17+
18+
/**
19+
* A {@link VectorScorer} backed by {@link BinaryDocValues}.
20+
*
21+
* <p>This scorer is used in the exact search path to score documents whose vectors are stored
22+
* as serialized bytes in {@link BinaryDocValues}. Document vectors are deserialized on the fly
23+
* and compared against the query vector using the similarity function derived from the
24+
* configured {@link SpaceType}.
25+
*
26+
* <p>Use the static factory methods to create an instance:
27+
* <ul>
28+
* <li>{@link #create(float[], BinaryDocValues, SpaceType)}
29+
* — for float[] query vectors. Document vectors are deserialized from {@link BytesRef} to
30+
* float[] via {@link KNNVectorAsCollectionOfFloatsSerializer}.</li>
31+
* <li>{@link #create(byte[], BinaryDocValues, SpaceType)}
32+
* — for byte[] query vectors. Document vectors are extracted as raw byte[] from the
33+
* {@link BytesRef}.</li>
34+
* </ul>
35+
*/
36+
public class KNNBinaryDocValuesScorer implements VectorScorer {
37+
38+
private final BinaryDocValues binaryDocValues;
39+
private final ScoreFunction scoreFunction;
40+
41+
/**
42+
* Strategy for computing a similarity score from a serialized document vector.
43+
*
44+
* <p>A functional interface is used here because the two factory methods ({@link #create(float[], BinaryDocValues, SpaceType)}
45+
* and {@link #create(byte[], BinaryDocValues, SpaceType)}) require different deserialization
46+
* and comparison logic. Each factory method captures its specific query vector type and
47+
* deserialization strategy in a lambda at construction time, avoiding runtime type checks
48+
* or branching in {@link #score()} on every call.
49+
*/
50+
@FunctionalInterface
51+
private interface ScoreFunction {
52+
float score(BytesRef bytesRef) throws IOException;
53+
}
54+
55+
private KNNBinaryDocValuesScorer(BinaryDocValues binaryDocValues, ScoreFunction scoreFunction) {
56+
this.binaryDocValues = binaryDocValues;
57+
this.scoreFunction = scoreFunction;
58+
}
59+
60+
/**
61+
* Creates a scorer for a float[] query vector.
62+
*
63+
* @param queryVector the query vector
64+
* @param binaryDocValues the binary doc values containing serialized document vectors
65+
* @param spaceType the space type defining the similarity function
66+
* @return a new {@link KNNBinaryDocValuesScorer}
67+
*/
68+
public static KNNBinaryDocValuesScorer create(float[] queryVector, BinaryDocValues binaryDocValues, SpaceType spaceType) {
69+
return new KNNBinaryDocValuesScorer(binaryDocValues, bytesRef -> {
70+
float[] docVector = KNNVectorAsCollectionOfFloatsSerializer.INSTANCE.byteToFloatArray(bytesRef);
71+
return spaceType.getKnnVectorSimilarityFunction().compare(queryVector, docVector);
72+
});
73+
}
74+
75+
/**
76+
* Creates a scorer for a byte[] query vector.
77+
*
78+
* @param queryVector the query vector
79+
* @param binaryDocValues the binary doc values containing serialized document vectors
80+
* @param spaceType the space type defining the similarity function
81+
* @return a new {@link KNNBinaryDocValuesScorer}
82+
*/
83+
public static KNNBinaryDocValuesScorer create(byte[] queryVector, BinaryDocValues binaryDocValues, SpaceType spaceType) {
84+
return new KNNBinaryDocValuesScorer(binaryDocValues, bytesRef -> {
85+
byte[] docVector = ArrayUtil.copyOfSubArray(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length);
86+
return spaceType.getKnnVectorSimilarityFunction().compare(queryVector, docVector);
87+
});
88+
}
89+
90+
@Override
91+
public float score() throws IOException {
92+
return scoreFunction.score(binaryDocValues.binaryValue());
93+
}
94+
95+
@Override
96+
public DocIdSetIterator iterator() {
97+
return binaryDocValues;
98+
}
99+
}
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.knn.index.query.scorers;
7+
8+
import org.apache.lucene.search.ConjunctionUtils;
9+
import org.apache.lucene.search.DocIdSetIterator;
10+
import org.apache.lucene.search.VectorScorer;
11+
import org.apache.lucene.util.BitSet;
12+
import org.opensearch.common.Nullable;
13+
14+
import java.util.Arrays;
15+
16+
import java.io.IOException;
17+
18+
/**
19+
* A {@link VectorScorer} decorator for nested (parent-child) document structures that groups
20+
* child documents by their parent and yields only the highest-scoring child per parent.
21+
*
22+
* <p>This is adapted from Lucene's {@code DiversifyingChildrenVectorScorer} inner class in
23+
* {@code DiversifyingChildrenFloatKnnVectorQuery}, re-implemented as a standalone {@link VectorScorer}
24+
* so it can be used in OpenSearch's exact search path.
25+
*
26+
* <h2>Document Layout</h2>
27+
* <p>Lucene block-joins store parent and child documents in contiguous doc-id ranges:
28+
* <pre>
29+
* [child_0, child_1, ..., child_n, PARENT, child_0, child_1, ..., child_m, PARENT, ...]
30+
* </pre>
31+
* The {@code parentBitSet} identifies which doc ids are parents. Every doc id between two
32+
* consecutive parent bits is a child of the later parent.
33+
*
34+
* <h2>Iteration Behavior</h2>
35+
* <p>Each call to {@link #iterator()}'s {@code nextDoc()} advances through one parent group:
36+
* <ol>
37+
* <li>Finds the next child document (respecting the optional filter).</li>
38+
* <li>Determines the parent for that child via {@code parentBitSet.nextSetBit()}.</li>
39+
* <li>Iterates over all children belonging to that parent, scoring each one.</li>
40+
* <li>Returns the doc id of the best-scoring child; {@link #score()} returns its score.</li>
41+
* </ol>
42+
*
43+
* <h2>Filtered vs Unfiltered</h2>
44+
* <ul>
45+
* <li><b>Unfiltered</b> ({@code acceptedChildrenIterator == null}): every vector document is
46+
* considered. The underlying vector iterator drives iteration directly.</li>
47+
* <li><b>Filtered</b>: the {@code filterIdsIterator} is intersected with the vector
48+
* iterator via {@link #maybeIntersectWithFilter}, producing a single iterator
49+
* that yields only doc ids present in both. This keeps the two iterators in lockstep
50+
* so the vector scorer is always positioned correctly when {@link #score()} is called.</li>
51+
* </ul>
52+
*
53+
* <h2>Example</h2>
54+
* <p>Given children [0,1,2,3,4] → parent 5, children [6,7,8] → parent 9, child [10] → parent 11,
55+
* and a filter that excludes children 2 and 7:
56+
* <pre>
57+
* Accepted children: {0, 1, 3, 4, 6, 8, 10}
58+
*
59+
* nextDoc() → bestChild=1 (best of {0,1,3,4} under parent 5)
60+
* nextDoc() → bestChild=6 (best of {6,8} under parent 9)
61+
* nextDoc() → bestChild=10 (only child under parent 11)
62+
* nextDoc() → NO_MORE_DOCS
63+
* </pre>
64+
*
65+
* @see org.apache.lucene.search.VectorScorer
66+
* @see org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery
67+
*/
68+
class NestedBestChildVectorScorer implements VectorScorer {
69+
private final VectorScorer childrenVectorScorer;
70+
private final DocIdSetIterator childIterator;
71+
private final BitSet parentBitSet;
72+
private final DocIdSetIterator iterator;
73+
private int bestChild = -1;
74+
private float currentScore = Float.NEGATIVE_INFINITY;
75+
76+
/**
77+
* Creates a scorer that finds the best-scoring child per parent, optionally restricted to a
78+
* subset of accepted children.
79+
*
80+
* <p>When {@code filterIdsIterator} is {@code null} (unfiltered), the scorer's own
81+
* vector iterator is used to drive child iteration, matching the behavior of Lucene's
82+
* {@code DiversifyingChildrenVectorScorer} but without requiring a separate filter iterator.
83+
*
84+
* @param filterIdsIterator iterator over the accepted child doc ids (i.e. children that
85+
* pass the filter). Pass {@code null} for the unfiltered case
86+
* where all vector documents are considered.
87+
* @param parentBitSet a {@link BitSet} with bits set at every parent doc id.
88+
* Used to determine parent boundaries for grouping children.
89+
* @param childrenVectorScorer the underlying scorer that computes similarity scores for
90+
* individual child documents against the query vector.
91+
*/
92+
public NestedBestChildVectorScorer(
93+
@Nullable DocIdSetIterator filterIdsIterator,
94+
BitSet parentBitSet,
95+
VectorScorer childrenVectorScorer
96+
) {
97+
this.childrenVectorScorer = childrenVectorScorer;
98+
this.parentBitSet = parentBitSet;
99+
DocIdSetIterator vectorIterator = childrenVectorScorer.iterator();
100+
this.childIterator = maybeIntersectWithFilter(vectorIterator, filterIdsIterator);
101+
this.iterator = createIterator();
102+
}
103+
104+
/**
105+
* Returns the score of the best-scoring child for the current parent group.
106+
* Only valid after a successful call to {@code iterator().nextDoc()}.
107+
*/
108+
@Override
109+
public float score() throws IOException {
110+
return currentScore;
111+
}
112+
113+
/**
114+
* Returns a {@link DocIdSetIterator} whose {@code nextDoc()} yields the doc id of the
115+
* best-scoring child for each successive parent. The same instance is returned on every call.
116+
*/
117+
@Override
118+
public DocIdSetIterator iterator() {
119+
return iterator;
120+
}
121+
122+
/**
123+
* Returns the vector iterator directly if no filter is provided, otherwise intersects
124+
* it with the filter so that only doc ids present in both are yielded.
125+
*/
126+
private static DocIdSetIterator maybeIntersectWithFilter(
127+
DocIdSetIterator vectorIterator,
128+
@Nullable DocIdSetIterator filterIdsIterator
129+
) {
130+
if (filterIdsIterator == null) {
131+
return vectorIterator;
132+
}
133+
return ConjunctionUtils.intersectIterators(Arrays.asList(filterIdsIterator, vectorIterator));
134+
}
135+
136+
/**
137+
* Creates a {@link DocIdSetIterator} that groups children by parent and yields the
138+
* best-scoring child per parent. Each {@code nextDoc()} call advances through one
139+
* parent group and returns the doc id of the highest-scoring child within that group.
140+
*/
141+
private DocIdSetIterator createIterator() {
142+
return new DocIdSetIterator() {
143+
@Override
144+
public int docID() {
145+
return bestChild;
146+
}
147+
148+
@Override
149+
public int nextDoc() throws IOException {
150+
int nextChild = childIterator.docID();
151+
if (nextChild == -1) {
152+
nextChild = childIterator.nextDoc();
153+
}
154+
if (nextChild == NO_MORE_DOCS) {
155+
bestChild = NO_MORE_DOCS;
156+
return NO_MORE_DOCS;
157+
}
158+
159+
currentScore = Float.NEGATIVE_INFINITY;
160+
int currentParent = parentBitSet.nextSetBit(nextChild);
161+
162+
do {
163+
float score = childrenVectorScorer.score();
164+
if (score > currentScore) {
165+
bestChild = nextChild;
166+
currentScore = score;
167+
}
168+
} while ((nextChild = childIterator.nextDoc()) != NO_MORE_DOCS && nextChild < currentParent);
169+
170+
return bestChild;
171+
}
172+
173+
/**
174+
* Not supported. This iterator returns the best-scoring child per parent group,
175+
* which requires evaluating <em>all</em> children within a group. Advancing to an
176+
* arbitrary target could land in the middle of a parent group, making it impossible
177+
* to consider earlier (potentially higher-scoring) children without backtracking
178+
* — violating the forward-only iterator contract.
179+
*/
180+
@Override
181+
public int advance(int target) {
182+
throw new UnsupportedOperationException();
183+
}
184+
185+
@Override
186+
public long cost() {
187+
return childIterator.cost();
188+
}
189+
};
190+
}
191+
}

0 commit comments

Comments
 (0)