Skip to content

Commit

Permalink
*: support vector search (#9486)
Browse files Browse the repository at this point in the history
close #9032

*: support vector search

Co-authored-by: JaySon <[email protected]>
Co-authored-by: JaySon-Huang <[email protected]>
  • Loading branch information
Lloyd-Pottiger and JaySon-Huang authored Sep 30, 2024
1 parent 69dd613 commit 2a198f1
Show file tree
Hide file tree
Showing 210 changed files with 16,885 additions and 594 deletions.
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,12 @@
[submodule "contrib/not_null"]
path = contrib/not_null
url = https://github.com/bitwizeshift/not_null.git
[submodule "contrib/usearch"]
path = contrib/usearch
url = https://github.com/unum-cloud/usearch.git
[submodule "contrib/simsimd"]
path = contrib/simsimd
url = https://github.com/ashvardanian/SimSIMD
[submodule "contrib/highfive"]
path = contrib/highfive
url = https://github.com/BlueBrain/HighFive
2 changes: 1 addition & 1 deletion cmake/cpu_features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ elseif (ARCH_AMD64)
# so we do not set the flags to avoid core dump in old machines
option (TIFLASH_ENABLE_AVX_SUPPORT "Use AVX/AVX2 instructions on x86_64" ON)
option (TIFLASH_ENABLE_AVX512_SUPPORT "Use AVX512 instructions on x86_64" ON)

# `haswell` has been available since 2013 and provides the cpu features avx2 and bmi2, which makes it a practical target arch for the optimizer
option (TIFLASH_ENABLE_ARCH_HASWELL_SUPPORT "Use instructions based on architecture `haswell` on x86_64" ON)

Expand Down
9 changes: 9 additions & 0 deletions contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,12 @@ add_subdirectory(aws-cmake)
add_subdirectory(simdjson)

add_subdirectory(fastpforlib)

add_subdirectory(usearch-cmake)

add_subdirectory(simsimd-cmake)

# hdf5/highfive are only configured when tests are enabled and the build type is not DEBUG.
# Any target linking tiflash_contrib::hdf5 or tiflash_contrib::highfive must be guarded
# by the same condition, otherwise those targets do not exist.
# NOTE(review): presumably these libraries are only needed by benchmarks — confirm against the consumers.
if (ENABLE_TESTS AND NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
add_subdirectory(hdf5-cmake)
add_subdirectory(highfive-cmake)
endif ()
1 change: 1 addition & 0 deletions contrib/hdf5-cmake/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/download/*
41 changes: 41 additions & 0 deletions contrib/hdf5-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
include(ExternalProject)

# hdf5 is too large. Instead of adding it as a submodule, simply download the
# release archive from GitHub and build it as an external project.

# File name of the static hdf5 library installed by the external build.
# CMAKE_FIND_LIBRARY_PREFIXES is a *list* of prefixes consulted by find_library()
# and may contain several (or empty) entries, so it must not be spliced into a
# path. CMAKE_STATIC_LIBRARY_PREFIX / CMAKE_STATIC_LIBRARY_SUFFIX are the single
# platform values used to name a static library ("lib" / ".a" on Linux and macOS).
set(HDF5_STATIC_LIBRARY_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}hdf5${CMAKE_STATIC_LIBRARY_SUFFIX}")

ExternalProject_Add(hdf5-external
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    # The archive is stored next to this file; contrib/hdf5-cmake/.gitignore
    # excludes /download/* from version control.
    DOWNLOAD_DIR ${TiFlash_SOURCE_DIR}/contrib/hdf5-cmake/download
    URL https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5_1.14.4.3.zip
    URL_HASH MD5=bc987d22e787290127aacd7b99b4f31e
    CMAKE_ARGS
        # hdf5 is always built in Release mode, independent of the TiFlash build type.
        -DCMAKE_BUILD_TYPE=Release
        -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
        # Static library only, with optional components and codecs switched off.
        -DBUILD_STATIC_LIBS=ON
        -DBUILD_SHARED_LIBS=OFF
        -DBUILD_TESTING=OFF
        -DHDF5_BUILD_HL_LIB=OFF
        -DHDF5_BUILD_TOOLS=OFF
        -DHDF5_BUILD_CPP_LIB=ON
        -DHDF5_BUILD_EXAMPLES=OFF
        -DHDF5_ENABLE_Z_LIB_SUPPORT=OFF
        -DHDF5_ENABLE_SZIP_SUPPORT=OFF
    # Workaround for Ninja: it needs to know which step produces the library file.
    BUILD_BYPRODUCTS <INSTALL_DIR>/lib/${HDF5_STATIC_LIBRARY_NAME}
    USES_TERMINAL_DOWNLOAD TRUE
    USES_TERMINAL_CONFIGURE TRUE
    USES_TERMINAL_BUILD TRUE
    USES_TERMINAL_INSTALL TRUE
    # Only built when a target that depends on it is built.
    EXCLUDE_FROM_ALL TRUE
    DOWNLOAD_EXTRACT_TIMESTAMP TRUE
)

# Resolve the actual install directory of the external project.
ExternalProject_Get_Property(hdf5-external INSTALL_DIR)

# Expose the externally built archive as an imported target.
add_library(tiflash_contrib::hdf5 STATIC IMPORTED GLOBAL)
set_target_properties(tiflash_contrib::hdf5 PROPERTIES
    IMPORTED_LOCATION "${INSTALL_DIR}/lib/${HDF5_STATIC_LIBRARY_NAME}"
)
# Make sure the external project is built before anything that links the imported target.
add_dependencies(tiflash_contrib::hdf5 hdf5-external)

# The include directory is only populated at build time (install step), but CMake
# requires include directories of imported targets to exist when generating.
file(MAKE_DIRECTORY "${INSTALL_DIR}/include")
target_include_directories(tiflash_contrib::hdf5 SYSTEM INTERFACE
    "${INSTALL_DIR}/include"
)
1 change: 1 addition & 0 deletions contrib/highfive
Submodule highfive added at 0d0259
18 changes: 18 additions & 0 deletions contrib/highfive-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# HighFive is consumed as a header-only INTERFACE library on top of hdf5.
set(HIGHFIVE_PROJECT_DIR "${TiFlash_SOURCE_DIR}/contrib/highfive")
set(HIGHFIVE_SOURCE_DIR "${HIGHFIVE_PROJECT_DIR}/include")

# Fail early when the submodule has not been checked out.
if (NOT EXISTS "${HIGHFIVE_SOURCE_DIR}/highfive/highfive.hpp")
    message (FATAL_ERROR "submodule contrib/highfive not found")
endif ()

add_library(_highfive INTERFACE)
add_library(tiflash_contrib::highfive ALIAS _highfive)

# Consumers get the HighFive headers (as system headers) and link against hdf5.
target_include_directories(_highfive SYSTEM INTERFACE "${HIGHFIVE_SOURCE_DIR}")
target_link_libraries(_highfive INTERFACE tiflash_contrib::hdf5)
1 change: 1 addition & 0 deletions contrib/simsimd
Submodule simsimd added at ff5143
13 changes: 13 additions & 0 deletions contrib/simsimd-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# SimSIMD is consumed as a header-only INTERFACE library.
set(SIMSIMD_PROJECT_DIR "${TiFlash_SOURCE_DIR}/contrib/simsimd")
set(SIMSIMD_SOURCE_DIR "${SIMSIMD_PROJECT_DIR}/include")

# Fail early when the submodule has not been checked out.
if (NOT EXISTS "${SIMSIMD_SOURCE_DIR}/simsimd/simsimd.h")
    message (FATAL_ERROR "submodule contrib/simsimd not found")
endif ()

add_library(_simsimd INTERFACE)
add_library(tiflash_contrib::simsimd ALIAS _simsimd)

# Headers are exposed as system headers to the consumers of the target.
target_include_directories(_simsimd SYSTEM INTERFACE "${SIMSIMD_SOURCE_DIR}")
1 change: 1 addition & 0 deletions contrib/usearch
Submodule usearch added at 5ad205
15 changes: 15 additions & 0 deletions contrib/usearch-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# usearch is consumed as a header-only INTERFACE library.
set(USEARCH_PROJECT_DIR "${TiFlash_SOURCE_DIR}/contrib/usearch")
set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")

# Fail early when the submodule has not been checked out.
if (NOT EXISTS "${USEARCH_SOURCE_DIR}/usearch/index.hpp")
    message (FATAL_ERROR "submodule contrib/usearch not found")
endif ()

add_library(_usearch INTERFACE)
add_library(tiflash_contrib::usearch ALIAS _usearch)

# The simsimd copy bundled inside usearch (simsimd/include) is intentionally not
# exposed here: our own simsimd is used instead.
target_include_directories(_usearch SYSTEM INTERFACE
    "${USEARCH_PROJECT_DIR}/fp16/include"
    "${USEARCH_SOURCE_DIR}"
)
34 changes: 31 additions & 3 deletions dbms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ add_headers_and_sources(dbms src/Client)
add_headers_only(dbms src/Flash/Coprocessor)
add_headers_only(dbms src/Server)

add_headers_and_sources(tiflash_vector_search src/VectorSearch)

check_then_add_sources_compile_flag (
TIFLASH_ENABLE_ARCH_HASWELL_SUPPORT
"${TIFLASH_COMPILER_ARCH_HASWELL_FLAG}"
Expand Down Expand Up @@ -203,12 +205,25 @@ target_link_libraries (tiflash_common_io
)

target_include_directories (tiflash_common_io BEFORE PRIVATE ${kvClient_SOURCE_DIR}/include)
target_compile_definitions(tiflash_common_io PUBLIC -DTIFLASH_SOURCE_PREFIX=\"${TiFlash_SOURCE_DIR}\")
target_compile_definitions (tiflash_common_io PUBLIC -DTIFLASH_SOURCE_PREFIX=\"${TiFlash_SOURCE_DIR}\")

# Library built from the headers and sources collected from src/VectorSearch.
add_library(tiflash_vector_search ${tiflash_vector_search_headers} ${tiflash_vector_search_sources})

# Vector index / distance dependencies first, then the formatting library.
target_link_libraries(tiflash_vector_search tiflash_contrib::usearch tiflash_contrib::simsimd fmt)

target_link_libraries (dbms
${OPENSSL_CRYPTO_LIBRARY}
${BTRIE_LIBRARIES}
absl::synchronization
tiflash_contrib::aws_s3
tiflash_vector_search

etcdpb
tiflash_parsers
Expand Down Expand Up @@ -362,7 +377,6 @@ if (ENABLE_TESTS)
add_check(gtests_dbms)

add_target_pch("pch-dbms.h" gtests_dbms)

grep_bench_sources(${TiFlash_SOURCE_DIR}/dbms dbms_bench_sources)
add_executable(bench_dbms EXCLUDE_FROM_ALL
${dbms_bench_sources}
Expand All @@ -373,7 +387,21 @@ if (ENABLE_TESTS)
)
target_include_directories(bench_dbms BEFORE PRIVATE ${SPARCEHASH_INCLUDE_DIR} ${benchmark_SOURCE_DIR}/include)
target_compile_definitions(bench_dbms PUBLIC DBMS_PUBLIC_GTEST)
target_link_libraries(bench_dbms gtest dbms test_util_bench_main benchmark tiflash_functions server_for_test delta_merge kvstore tiflash_aggregate_functions)
target_link_libraries(bench_dbms
gtest
benchmark

dbms
test_util_bench_main
tiflash_functions
server_for_test
delta_merge
tiflash_aggregate_functions
kvstore)

# tiflash_contrib::highfive is only defined for non-DEBUG test builds (see the matching
# condition in contrib/CMakeLists.txt), so it is linked under the same build-type check.
if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
target_link_libraries(bench_dbms tiflash_contrib::highfive)
endif()

add_check(bench_dbms)
endif ()
Expand Down
8 changes: 8 additions & 0 deletions dbms/src/Columns/ColumnArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,14 @@ void ColumnArray::insertDefault()
getOffsets().push_back(getOffsets().empty() ? 0 : getOffsets().back());
}

void ColumnArray::insertManyDefaults(size_t length)
{
auto & offsets = getOffsets();
size_t v = 0;
if (!offsets.empty())
v = offsets.back();
offsets.resize_fill(offsets.size() + length, v);
}

void ColumnArray::popBack(size_t n)
{
Expand Down
16 changes: 6 additions & 10 deletions dbms/src/Columns/ColumnArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,7 @@ class ColumnArray final : public COWPtrHelper<IColumn, ColumnArray>
}

void insertDefault() override;
void insertManyDefaults(size_t length) override
{
for (size_t i = 0; i < length; ++i)
insertDefault();
}
void insertManyDefaults(size_t length) override;
void popBack(size_t n) override;
/// TODO: If result_size_hint < 0, makes reserve() using size of filtered column, not source column to avoid some OOM issues.
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
Expand Down Expand Up @@ -176,16 +172,16 @@ class ColumnArray final : public COWPtrHelper<IColumn, ColumnArray>

std::pair<UInt32, StringRef> getElementRef(size_t element_idx) const;

private:
ColumnPtr data;
ColumnPtr offsets;

size_t ALWAYS_INLINE offsetAt(size_t i) const { return i == 0 ? 0 : getOffsets()[i - 1]; }
size_t ALWAYS_INLINE sizeAt(size_t i) const
{
return i == 0 ? getOffsets()[0] : (getOffsets()[i] - getOffsets()[i - 1]);
}

private:
ColumnPtr data;
ColumnPtr offsets;

size_t ALWAYS_INLINE offsetAt(size_t i) const { return i == 0 ? 0 : getOffsets()[i - 1]; }

/// Multiply values if the nested column is ColumnVector<T>.
template <typename T>
Expand Down
1 change: 1 addition & 0 deletions dbms/src/Common/CurrentMetrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
M(DT_SnapshotOfReadRaw) \
M(DT_SnapshotOfSegmentSplit) \
M(DT_SnapshotOfSegmentMerge) \
M(DT_SnapshotOfSegmentIngestIndex) \
M(DT_SnapshotOfSegmentIngest) \
M(DT_SnapshotOfDeltaMerge) \
M(DT_SnapshotOfDeltaCompact) \
Expand Down
7 changes: 6 additions & 1 deletion dbms/src/Common/FailPoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ namespace DB
M(force_fail_to_create_etcd_session) \
M(force_remote_read_for_batch_cop_once) \
M(exception_new_dynamic_thread) \
M(force_wait_index_timeout)
M(force_wait_index_timeout) \
M(force_local_index_task_memory_limit_exceeded) \
M(exception_build_local_index_for_file) \
M(force_not_support_vector_index) \
M(sync_schema_request_failure)

#define APPLY_FOR_FAILPOINTS(M) \
M(skip_check_segment_update) \
Expand Down Expand Up @@ -106,6 +110,7 @@ namespace DB
M(proactive_flush_force_set_type) \
M(exception_when_fetch_disagg_pages) \
M(cop_send_failure) \
M(file_cache_fg_download_fail) \
M(force_set_parallel_prehandle_threshold) \
M(force_raise_prehandle_exception) \
M(force_agg_on_partial_block) \
Expand Down
8 changes: 8 additions & 0 deletions dbms/src/Common/LRUCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ class LRUCache
return res;
}

/// Checks whether `key` is currently present in the LRU cache.
/// This is a plain lookup under the cache mutex; the LRU order is not updated.
bool contains(const Key & key)
{
    std::scoped_lock cache_lock(mutex);
    return cells.find(key) != cells.end();
}

void set(const Key & key, const MappedPtr & mapped)
{
std::scoped_lock cache_lock(mutex);
Expand Down
13 changes: 13 additions & 0 deletions dbms/src/Common/TiFlashBuildInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include <Common/TiFlashBuildInfo.h>
#include <Common/config.h>
#include <Common/config_version.h>
#include <VectorSearch/DistanceSIMDFeatures.h>
#include <VectorSearch/SIMDFeatures.h>
#include <common/config_common.h>
#include <common/logger_useful.h>
#include <fmt/core.h>
Expand Down Expand Up @@ -140,6 +142,17 @@ String getEnabledFeatures()
"fdo",
#endif
};
{
auto f = DB::DM::VectorIndexHNSWSIMDFeatures::get();
for (const auto & feature : f)
features.push_back(feature);
}
{
auto f = DB::VectorDistanceSIMDFeatures::get();
for (const auto & feature : f)
features.push_back(feature);
}

return fmt::format("{}", fmt::join(features.begin(), features.end(), " "));
}
// clang-format on
Expand Down
17 changes: 17 additions & 0 deletions dbms/src/Common/TiFlashMetrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,23 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva
F(type_cop, {"type", "cop"}), \
F(type_cop_stream, {"type", "cop_stream"}), \
F(type_batch, {"type", "batch"}), ) \
M(tiflash_vector_index_memory_usage, \
"Vector index memory usage", \
Gauge, \
F(type_build, {"type", "build"}), \
F(type_view, {"type", "view"})) \
M(tiflash_vector_index_active_instances, \
"Active Vector index instances", \
Gauge, \
F(type_build, {"type", "build"}), \
F(type_view, {"type", "view"})) \
M(tiflash_vector_index_duration, \
"Vector index operation duration", \
Histogram, \
F(type_build, {{"type", "build"}}, ExpBuckets{0.001, 2, 20}), \
F(type_download, {{"type", "download"}}, ExpBuckets{0.001, 2, 20}), \
F(type_view, {{"type", "view"}}, ExpBuckets{0.001, 2, 20}), \
F(type_search, {{"type", "search"}}, ExpBuckets{0.001, 2, 20})) \
M(tiflash_storage_io_limiter_pending_count, \
"I/O limiter pending count", \
Counter, \
Expand Down
Loading

0 comments on commit 2a198f1

Please sign in to comment.