Added default bulk simd bbq scoring.

0ctopus13prime · 0ctopus13prime · commit ad5507ad470a · 2026-03-18T18:44:09.000-07:00
Signed-off-by: Dooyong Kim &lt;kdooyong@amazon.com&gt;
diff --git a/jni/src/simd/similarity_function/default_simd_similarity_function.cpp b/jni/src/simd/similarity_function/default_simd_similarity_function.cpp
@@ -1,6 +1,7 @@
 #include <algorithm>
 #include <array>
 #include <cstddef>
+#include <cstring>
 #include <stdint.h>
 #include <cmath>
 
@@ -44,12 +45,268 @@ DefaultFP16SimilarityFunction<FaissScoreToLuceneScoreTransform::ipToMaxIpTransfo
 // 2. L2
 DefaultFP16SimilarityFunction<FaissScoreToLuceneScoreTransform::l2TransformBulk, FaissScoreToLuceneScoreTransform::l2Transform> DEFAULT_FP16_L2_SIMIL_FUNC;
 
+
+//
+// BBQ (ADC: 4-bit query x 1-bit data) - Default (non-SIMD) implementation
+//
+// Pure C++ implementation written for compiler auto-vectorization with -O3.
+// No SIMD intrinsics. Uses __builtin_popcount[ll] and std::memcpy.
+//
+// The query is 4-bit quantized and transposed into 4 bit planes (via transposeHalfByte).
+// Each bit plane has `binaryCodeBytes` bytes. The int4BitDotProduct computes:
+//   Result = popcount(plane0 AND data) * 1
+//          + popcount(plane1 AND data) * 2
+//          + popcount(plane2 AND data) * 4
+//          + popcount(plane3 AND data) * 8
+//
+
+static constexpr float FOUR_BIT_SCALE = 1.0f / 15.0f;
+
+// Reads the per-vector correction factors from a potentially unaligned address.
+// On-disk layout after binaryCode: [lowerInterval(f32)][upperInterval(f32)][additionalCorrection(f32)][quantizedComponentSum(i32)]
+static FORCE_INLINE void readDataCorrections(const uint8_t* ptr, float& ax, float& lx, float& additional, float& x1) {
+    float lower, upper;
+    std::memcpy(&lower,      ptr,      sizeof(float));
+    std::memcpy(&upper,      ptr + 4,  sizeof(float));
+    std::memcpy(&additional, ptr + 8,  sizeof(float));
+    int32_t componentSum;
+    std::memcpy(&componentSum, ptr + 12, sizeof(int32_t));
+    ax = lower;
+    lx = upper - lower;
+    x1 = static_cast<float>(componentSum);
+}
+
+// Scalar int4BitDotProduct
+// q has 4 * binaryCodeBytes bytes (4 bit planes), d has binaryCodeBytes bytes.
+// Uses std::memcpy for uint64_t loads to avoid undefined behavior from unaligned
+// reinterpret_cast when binaryCodeBytes is not a multiple of 8.
+static FORCE_INLINE int64_t int4BitDotProduct(const uint8_t* q, const uint8_t* d, const int32_t binaryCodeBytes) {
+    int64_t result = 0;
+    for (int32_t bitPlane = 0 ; bitPlane < 4 ; ++bitPlane) {
+        const int32_t words = binaryCodeBytes >> 3;
+
+        int64_t subResult = 0;
+        for (int32_t w = 0 ; w < words ; ++w) {
+            uint64_t qWord, dWord;
+            std::memcpy(&qWord, q + bitPlane * binaryCodeBytes + w * 8, sizeof(uint64_t));
+            std::memcpy(&dWord, d + w * 8, sizeof(uint64_t));
+            subResult += __builtin_popcountll(qWord & dWord);
+        }
+
+        const int32_t remainStart = words * 8;
+        for (int32_t r = remainStart ; r < binaryCodeBytes ; ++r) {
+            subResult += __builtin_popcount((q[bitPlane * binaryCodeBytes + r] & d[r]) & 0xFF);
+        }
+
+        result += subResult << bitPlane;
+    }
+    return result;
+}
+
+// Default (non-SIMD) batched int4BitDotProduct.
+// Processes one batch element at a time with the byte loop as the inner loop,
+// giving the compiler the best auto-vectorization opportunity across the byte dimension.
+template <int BATCH_SIZE>
+static FORCE_INLINE void default4bitDotProductBatch(
+    const uint8_t* queryPtr,
+    uint8_t** dataVecs,
+    const int32_t binaryCodeBytes,
+    float* results) {
+
+    const uint8_t* plane0 = queryPtr;
+    const uint8_t* plane1 = queryPtr + binaryCodeBytes;
+    const uint8_t* plane2 = queryPtr + 2 * binaryCodeBytes;
+    const uint8_t* plane3 = queryPtr + 3 * binaryCodeBytes;
+
+    const int32_t words = binaryCodeBytes >> 3;
+    const int32_t remainStart = words * 8;
+
+    for (int32_t b = 0 ; b < BATCH_SIZE ; ++b) {
+        int64_t acc = 0;
+        const uint8_t* data = dataVecs[b];
+
+        // 8-byte word loop — compiler can auto-vectorize this
+        for (int32_t w = 0 ; w < words ; ++w) {
+            const int32_t offset = w * 8;
+            uint64_t dWord;
+            std::memcpy(&dWord, data + offset, sizeof(uint64_t));
+
+            uint64_t q0, q1, q2, q3;
+            std::memcpy(&q0, plane0 + offset, sizeof(uint64_t));
+            std::memcpy(&q1, plane1 + offset, sizeof(uint64_t));
+            std::memcpy(&q2, plane2 + offset, sizeof(uint64_t));
+            std::memcpy(&q3, plane3 + offset, sizeof(uint64_t));
+
+            acc += __builtin_popcountll(q0 & dWord) * 1
+                 + __builtin_popcountll(q1 & dWord) * 2
+                 + __builtin_popcountll(q2 & dWord) * 4
+                 + __builtin_popcountll(q3 & dWord) * 8;
+        }
+
+        // Byte remainder — handles non-multiple-of-8 binaryCodeBytes
+        for (int32_t r = remainStart ; r < binaryCodeBytes ; ++r) {
+            uint8_t db = data[r];
+            acc += __builtin_popcount((plane0[r] & db) & 0xFF) * 1
+                 + __builtin_popcount((plane1[r] & db) & 0xFF) * 2
+                 + __builtin_popcount((plane2[r] & db) & 0xFF) * 4
+                 + __builtin_popcount((plane3[r] & db) & 0xFF) * 8;
+        }
+
+        results[b] = static_cast<float>(acc);
+    }
+}
+
+template <bool IsMaxIP>
+struct DefaultBBQSimilarityFunction final : SimilarityFunction {
+    HOT_SPOT void calculateSimilarityInBulk(SimdVectorSearchContext* srchContext,
+                                            int32_t* internalVectorIds,
+                                            float* scores,
+                                            const int32_t numVectors) {
+        const auto* queryPtr = reinterpret_cast<const uint8_t*>(srchContext->queryVectorSimdAligned);
+        const int32_t dim = srchContext->dimension;
+        const int32_t binaryCodeBytes = (dim + 7) / 8;
+
+        // Read query correction factors from tmpBuffer
+        const auto* queryCorrectionPtr = reinterpret_cast<const float*>(srchContext->tmpBuffer.data());
+        const float ay = queryCorrectionPtr[0];
+        const float ly = (queryCorrectionPtr[1] - queryCorrectionPtr[0]) * FOUR_BIT_SCALE;
+        const float queryAdditional = queryCorrectionPtr[2];
+        int32_t y1Raw; std::memcpy(&y1Raw, &queryCorrectionPtr[3], sizeof(int32_t));
+        const float y1 = static_cast<float>(y1Raw);
+        const float centroidDp = queryCorrectionPtr[4];
+
+        int32_t processedCount = 0;
+        constexpr int32_t vecBlock = 8;
+        constexpr int32_t vecHalfBlock = 4;
+        uint8_t* vectors[vecBlock];
+
+        // Batch size 8
+        for ( ; (processedCount + vecBlock) <= numVectors ; processedCount += vecBlock) {
+            srchContext->getVectorPointersInBulk(vectors, &internalVectorIds[processedCount], vecBlock);
+            default4bitDotProductBatch<vecBlock>(queryPtr, vectors, binaryCodeBytes, &scores[processedCount]);
+
+            for (int32_t i = 0 ; i < vecBlock ; ++i) {
+                float ax, lx, additional, x1;
+                readDataCorrections(vectors[i] + binaryCodeBytes, ax, lx, additional, x1);
+
+                scores[processedCount + i] = ax * ay * dim
+                                           + ay * lx * x1
+                                           + ax * ly * y1
+                                           + lx * ly * scores[processedCount + i];
+
+                if constexpr (IsMaxIP) {
+                    scores[processedCount + i] += queryAdditional + additional - centroidDp;
+                } else {
+                    scores[processedCount + i] = std::max(0.0F, queryAdditional + additional - 2 * scores[processedCount + i]);
+                }
+            }
+        }
+
+        // Batch size 4
+        for ( ; (processedCount + vecHalfBlock) <= numVectors ; processedCount += vecHalfBlock) {
+            srchContext->getVectorPointersInBulk(vectors, &internalVectorIds[processedCount], vecHalfBlock);
+            default4bitDotProductBatch<vecHalfBlock>(queryPtr, vectors, binaryCodeBytes, &scores[processedCount]);
+
+            for (int32_t i = 0 ; i < vecHalfBlock ; ++i) {
+                float ax, lx, additional, x1;
+                readDataCorrections(vectors[i] + binaryCodeBytes, ax, lx, additional, x1);
+
+                scores[processedCount + i] = ax * ay * dim
+                                           + ay * lx * x1
+                                           + ax * ly * y1
+                                           + lx * ly * scores[processedCount + i];
+
+                if constexpr (IsMaxIP) {
+                    scores[processedCount + i] += queryAdditional + additional - centroidDp;
+                } else {
+                    scores[processedCount + i] =
+                        std::max(0.0F, queryAdditional + additional - 2 * scores[processedCount + i]);
+                }
+            }
+        }
+
+        // Tail: remaining vectors (scalar)
+        for ( ; processedCount < numVectors ; ++processedCount) {
+            const auto* dataVec = srchContext->getVectorPointer(internalVectorIds[processedCount]);
+            const float qcDist = static_cast<float>(
+                int4BitDotProduct(queryPtr, dataVec, binaryCodeBytes));
+
+            float ax, lx, additional, x1;
+            readDataCorrections(dataVec + binaryCodeBytes, ax, lx, additional, x1);
+
+            scores[processedCount] = ax * ay * dim
+                                   + ay * lx * x1
+                                   + ax * ly * y1
+                                   + lx * ly * qcDist;
+
+            if constexpr (IsMaxIP) {
+                scores[processedCount] += queryAdditional + additional - centroidDp;
+            } else {
+                scores[processedCount] =
+                    std::max(0.0F, queryAdditional + additional - 2 * scores[processedCount]);
+            }
+        }
+
+        if constexpr (IsMaxIP) {
+            FaissScoreToLuceneScoreTransform::ipToMaxIpTransformBulk(scores, numVectors);
+        } else {
+            FaissScoreToLuceneScoreTransform::l2TransformBulk(scores, numVectors);
+        }
+    }
+
+    float calculateSimilarity(SimdVectorSearchContext* srchContext, const int32_t internalVectorId) {
+        const auto* queryPtr = reinterpret_cast<const uint8_t*>(srchContext->queryVectorSimdAligned);
+        const int32_t dim = srchContext->dimension;
+        const int32_t binaryCodeBytes = (dim + 7) / 8;
+
+        const auto* queryCorrectionPtr = reinterpret_cast<const float*>(srchContext->tmpBuffer.data());
+        const float ay = queryCorrectionPtr[0];
+        const float ly = (queryCorrectionPtr[1] - queryCorrectionPtr[0]) * FOUR_BIT_SCALE;
+        const float queryAdditional = queryCorrectionPtr[2];
+        int32_t y1Raw; std::memcpy(&y1Raw, &queryCorrectionPtr[3], sizeof(int32_t));
+        const float y1 = static_cast<float>(y1Raw);
+        const float centroidDp = queryCorrectionPtr[4];
+
+        const auto* dataVec = srchContext->getVectorPointer(internalVectorId);
+        const float qcDist = static_cast<float>(
+            int4BitDotProduct(queryPtr, dataVec, binaryCodeBytes));
+
+        float ax, lx, additional, x1;
+        readDataCorrections(dataVec + binaryCodeBytes, ax, lx, additional, x1);
+
+        float score = ax * ay * dim
+                    + ay * lx * x1
+                    + ax * ly * y1
+                    + lx * ly * qcDist;
+
+        if constexpr (IsMaxIP) {
+            score += queryAdditional + additional - centroidDp;
+            return FaissScoreToLuceneScoreTransform::ipToMaxIpTransform(score);
+        } else {
+            score = std::max(0.0F, queryAdditional + additional - 2 * score);
+            return FaissScoreToLuceneScoreTransform::l2Transform(score);
+        }
+    }
+};
+
+//
+// BBQ
+//
+// 1. Max IP
+DefaultBBQSimilarityFunction<true> BBQ_IP_SIMIL_FUNC;
+// 2. L2
+DefaultBBQSimilarityFunction<false> BBQ_L2_SIMIL_FUNC;
+
 #ifndef __NO_SELECT_FUNCTION
 SimilarityFunction* SimilarityFunction::selectSimilarityFunction(const NativeSimilarityFunctionType nativeFunctionType) {
     if (nativeFunctionType == NativeSimilarityFunctionType::FP16_MAXIMUM_INNER_PRODUCT) {
         return &DEFAULT_FP16_MAX_INNER_PRODUCT_SIMIL_FUNC;
     } else if (nativeFunctionType == NativeSimilarityFunctionType::FP16_L2) {
         return &DEFAULT_FP16_L2_SIMIL_FUNC;
+    } else if (nativeFunctionType == NativeSimilarityFunctionType::BBQ_IP) {
+        return &BBQ_IP_SIMIL_FUNC;
+    } else if (nativeFunctionType == NativeSimilarityFunctionType::BBQ_L2) {
+        return &BBQ_L2_SIMIL_FUNC;
     }
 
     throw std::runtime_error("Invalid native similarity function type was given, nativeFunctionType="