Skip to content

Commit 5ec769a

Browse files
dkaczynskiravishankarDavid KaczynskiSiddharth GollapudiNeelamMahapatro
authored
Rebasing main's latest commits onto ravi/filter_support_rebased (#225)
- add code for two variants of filtered index, readme and CI tests - add utils for synthetic label generation and CI tests. * Add co-authors Co-authored-by: ravishankar <[email protected]> Co-authored-by: Varun Sivashankar <[email protected]> --------- Co-authored-by: ravishankar <[email protected]> Co-authored-by: David Kaczynski <[email protected]> Co-authored-by: Siddharth Gollapudi <[email protected]> Co-authored-by: Neelam Mahapatro <[email protected]> Co-authored-by: Harsha Vardhan Simhadri <[email protected]> Co-authored-by: Harsha Vardhan Simhadri <[email protected]> Co-authored-by: REDMOND\patelyash <[email protected]> Co-authored-by: Varun Sivashankar <[email protected]>
1 parent 5ba6a5d commit 5ec769a

33 files changed

+4149
-893
lines changed

.github/workflows/pr-test.yml

Lines changed: 40 additions & 2 deletions
Large diffs are not rendered by default.

CMakeSettings.json

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"configurations": [
3+
{
4+
"name": "x64-Release",
5+
"generator": "Ninja",
6+
"configurationType": "Release",
7+
"inheritEnvironments": [ "msvc_x64" ],
8+
"buildRoot": "${projectDir}\\out\\build\\${name}",
9+
"installRoot": "${projectDir}\\out\\install\\${name}",
10+
"cmakeCommandArgs": "",
11+
"buildCommandArgs": "",
12+
"ctestCommandArgs": ""
13+
},
14+
{
15+
"name": "WSL-GCC-Release",
16+
"generator": "Ninja",
17+
"configurationType": "RelWithDebInfo",
18+
"buildRoot": "${projectDir}\\out\\build\\${name}",
19+
"installRoot": "${projectDir}\\out\\install\\${name}",
20+
"cmakeExecutable": "cmake",
21+
"cmakeCommandArgs": "",
22+
"buildCommandArgs": "",
23+
"ctestCommandArgs": "",
24+
"inheritEnvironments": [ "linux_x64" ],
25+
"wslPath": "${defaultWSLPath}"
26+
}
27+
]
28+
}

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,17 @@ Please see the following pages on using the compiled code:
8787
- [Commandline interface for building and searching SSD based indices](workflows/SSD_index.md)
8888
- [Commandline interface for building and searching in-memory indices](workflows/in_memory_index.md)
8989
- [Commandline examples for using in-memory streaming indices](workflows/dynamic_index.md)
90+
- [Commandline interface for building and searching in-memory indices with label data and filters](workflows/filtered_in_memory.md)
9091
- To be added: Python interfaces and docker files
92+
93+
Please cite this software in your work as:
94+
95+
```
96+
@misc{diskann-github,
97+
author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddharth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan},
98+
title = {{DiskANN: Scalable, Efficient and Feature-rich ANNS}},
99+
url = {https://github.com/Microsoft/DiskANN},
100+
version = {0.5},
101+
year = {2023}
102+
}
103+
```

include/disk_utils.h

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ namespace diskann {
4040
const uint32_t WARMUP_L = 20;
4141
const uint32_t NUM_KMEANS_REPS = 12;
4242

43-
template<typename T>
43+
template<typename T, typename LabelT>
4444
class PQFlashIndex;
4545

4646
DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
@@ -68,38 +68,47 @@ namespace diskann {
6868
uint64_t warmup_aligned_dim);
6969
#endif
7070

71-
DISKANN_DLLEXPORT int merge_shards(const std::string &vamana_prefix,
72-
const std::string &vamana_suffix,
73-
const std::string &idmaps_prefix,
74-
const std::string &idmaps_suffix,
75-
const _u64 nshards, unsigned max_degree,
76-
const std::string &output_vamana,
77-
const std::string &medoids_file);
71+
DISKANN_DLLEXPORT int merge_shards(
72+
const std::string &vamana_prefix, const std::string &vamana_suffix,
73+
const std::string &idmaps_prefix, const std::string &idmaps_suffix,
74+
const _u64 nshards, unsigned max_degree, const std::string &output_vamana,
75+
const std::string &medoids_file, bool use_filters = false,
76+
const std::string &labels_to_medoids_file = std::string(""));
77+
78+
DISKANN_DLLEXPORT void extract_shard_labels(
79+
const std::string &in_label_file, const std::string &shard_ids_bin,
80+
const std::string &shard_label_file);
7881

7982
template<typename T>
8083
DISKANN_DLLEXPORT std::string preprocess_base_file(
8184
const std::string &infile, const std::string &indexPrefix,
8285
diskann::Metric &distMetric);
8386

84-
template<typename T>
87+
template<typename T, typename LabelT = uint32_t>
8588
DISKANN_DLLEXPORT int build_merged_vamana_index(
8689
std::string base_file, diskann::Metric _compareMetric, unsigned L,
8790
unsigned R, double sampling_rate, double ram_budget,
8891
std::string mem_index_path, std::string medoids_file,
89-
std::string centroids_file, size_t build_pq_bytes, bool use_opq);
92+
std::string centroids_file, size_t build_pq_bytes, bool use_opq,
93+
bool use_filters = false, const std::string &label_file = std::string(""),
94+
const std::string &labels_to_medoids_file = std::string(""),
95+
const std::string &universal_label = "", const _u32 Lf = 0);
9096

91-
template<typename T>
97+
template<typename T, typename LabelT>
9298
DISKANN_DLLEXPORT uint32_t optimize_beamwidth(
93-
std::unique_ptr<diskann::PQFlashIndex<T>> &_pFlashIndex, T *tuning_sample,
94-
_u64 tuning_sample_num, _u64 tuning_sample_aligned_dim, uint32_t L,
95-
uint32_t nthreads, uint32_t start_bw = 2);
96-
97-
template<typename T>
98-
DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath,
99-
const char *indexFilePath,
100-
const char *indexBuildParameters,
101-
diskann::Metric _compareMetric,
102-
bool use_opq = false);
99+
std::unique_ptr<diskann::PQFlashIndex<T, LabelT>> &_pFlashIndex,
100+
T *tuning_sample, _u64 tuning_sample_num, _u64 tuning_sample_aligned_dim,
101+
uint32_t L, uint32_t nthreads, uint32_t start_bw = 2);
102+
103+
template<typename T, typename LabelT = uint32_t>
104+
DISKANN_DLLEXPORT int build_disk_index(
105+
const char *dataFilePath, const char *indexFilePath,
106+
const char *indexBuildParameters, diskann::Metric _compareMetric,
107+
bool use_opq = false, bool use_filters = false,
108+
const std::string &label_file =
109+
std::string(""), // default is empty string for no label_file
110+
const std::string &universal_label = "", const _u32 filter_threshold = 0,
111+
const _u32 Lf = 0); // an empty universal_label means no universal label
103112

104113
template<typename T>
105114
DISKANN_DLLEXPORT void create_disk_layout(

include/index.h

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#define DEFAULT_MAXC 750
2525

2626
namespace diskann {
27+
2728
inline double estimate_ram_usage(_u64 size, _u32 dim, _u32 datasize,
2829
_u32 degree) {
2930
double size_of_data = ((double) size) * ROUND_UP(dim, 8) * datasize;
@@ -60,7 +61,7 @@ namespace diskann {
6061
}
6162
};
6263

63-
template<typename T, typename TagT = uint32_t>
64+
template<typename T, typename TagT = uint32_t, typename LabelT = uint32_t>
6465
class Index {
6566
/**************************************************************************
6667
*
@@ -129,6 +130,17 @@ namespace diskann {
129130
Parameters &parameters,
130131
const std::vector<TagT> &tags);
131132

133+
// Filtered Support
134+
DISKANN_DLLEXPORT void build_filtered_index(
135+
const char *filename, const std::string &label_file,
136+
const size_t num_points_to_load, Parameters &parameters,
137+
const std::vector<TagT> &tags = std::vector<TagT>());
138+
139+
DISKANN_DLLEXPORT void set_universal_label(const LabelT &label);
140+
141+
// Get converted integer label from string to int map (_label_map)
142+
DISKANN_DLLEXPORT LabelT get_converted_label(const std::string &raw_label);
143+
132144
// Set starting point of an index before inserting any points incrementally
133145
DISKANN_DLLEXPORT void set_start_point(T *data);
134146
// Set starting point to a random point on a sphere of certain radius
@@ -155,6 +167,12 @@ namespace diskann {
155167
float *distances,
156168
std::vector<T *> &res_vectors);
157169

170+
// Filter support search
171+
template<typename IndexType>
172+
DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_filters(
173+
const T *query, const LabelT &filter_label, const size_t K,
174+
const unsigned L, IndexType *indices, float *distances);
175+
158176
// Will fail if tag already in the index or if tag=0.
159177
DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag);
160178

@@ -177,6 +195,8 @@ namespace diskann {
177195
DISKANN_DLLEXPORT consolidation_report
178196
consolidate_deletes(const Parameters &parameters);
179197

198+
DISKANN_DLLEXPORT void prune_all_nbrs(const Parameters &parameters);
199+
180200
DISKANN_DLLEXPORT bool is_index_saved();
181201

182202
// repositions frozen points to the end of _data - if they have been moved
@@ -208,8 +228,8 @@ namespace diskann {
208228

209229
protected:
210230
// No copy/assign.
211-
Index(const Index<T, TagT> &) = delete;
212-
Index<T, TagT> &operator=(const Index<T, TagT> &) = delete;
231+
Index(const Index<T, TagT, LabelT> &) = delete;
232+
Index<T, TagT, LabelT> &operator=(const Index<T, TagT, LabelT> &) = delete;
213233

214234
// Use after _data and _nd have been populated
215235
// Acquire exclusive _update_lock before calling
@@ -223,14 +243,23 @@ namespace diskann {
223243
// determines navigating node of the graph by calculating medoid of the data
224244
unsigned calculate_entry_point();
225245

246+
void parse_label_file(const std::string &label_file,
247+
size_t &num_pts_labels);
248+
249+
std::unordered_map<std::string, LabelT> load_label_map(
250+
const std::string &map_file);
251+
226252
std::pair<uint32_t, uint32_t> iterate_to_fixed_point(
227253
const T *node_coords, const unsigned Lindex,
228254
const std::vector<unsigned> &init_ids, InMemQueryScratch<T> *scratch,
255+
bool use_filter, const std::vector<LabelT> &filters,
229256
bool ret_frozen = true, bool search_invocation = false);
230257

231258
void search_for_point_and_prune(int location, _u32 Lindex,
232259
std::vector<unsigned> &pruned_list,
233-
InMemQueryScratch<T> *scratch);
260+
InMemQueryScratch<T> *scratch,
261+
bool use_filter = false,
262+
_u32 filteredLindex = 0);
234263

235264
void prune_neighbors(const unsigned location, std::vector<Neighbor> &pool,
236265
std::vector<unsigned> &pruned_list,
@@ -342,6 +371,19 @@ namespace diskann {
342371
bool _enable_tags = false;
343372
bool _normalize_vecs = false; // Using normalized L2 for cosine.
344373

374+
// Filter Support
375+
376+
bool _filtered_index = false;
377+
std::vector<std::vector<LabelT>> _pts_to_labels;
378+
tsl::robin_set<LabelT> _labels;
379+
std::string _labels_file;
380+
std::unordered_map<LabelT, _u32> _label_to_medoid_id;
381+
std::unordered_map<_u32, _u32> _medoid_counts;
382+
bool _use_universal_label = false;
383+
LabelT _universal_label = 0;
384+
uint32_t _filterIndexingQueueSize;
385+
std::unordered_map<std::string, LabelT> _label_map;
386+
345387
// Indexing parameters
346388
uint32_t _indexingQueueSize;
347389
uint32_t _indexingRange;

include/pq_flash_index.h

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
namespace diskann {
2222

23-
template<typename T>
23+
template<typename T, typename LabelT = uint32_t>
2424
class PQFlashIndex {
2525
public:
2626
DISKANN_DLLEXPORT PQFlashIndex(
@@ -70,11 +70,26 @@ namespace diskann {
7070
float *res_dists, const _u64 beam_width,
7171
const bool use_reorder_data = false, QueryStats *stats = nullptr);
7272

73+
DISKANN_DLLEXPORT void cached_beam_search(
74+
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
75+
float *res_dists, const _u64 beam_width, const bool use_filter,
76+
const LabelT &filter_label, const bool use_reorder_data = false,
77+
QueryStats *stats = nullptr);
78+
7379
DISKANN_DLLEXPORT void cached_beam_search(
7480
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
7581
float *res_dists, const _u64 beam_width, const _u32 io_limit,
7682
const bool use_reorder_data = false, QueryStats *stats = nullptr);
7783

84+
DISKANN_DLLEXPORT void cached_beam_search(
85+
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
86+
float *res_dists, const _u64 beam_width, const bool use_filter,
87+
const LabelT &filter_label, const _u32 io_limit,
88+
const bool use_reorder_data = false, QueryStats *stats = nullptr);
89+
90+
DISKANN_DLLEXPORT LabelT
91+
get_converted_label(const std::string &filter_label);
92+
7893
DISKANN_DLLEXPORT _u32 range_search(const T *query1, const double range,
7994
const _u64 min_l_search,
8095
const _u64 max_l_search,
@@ -94,12 +109,26 @@ namespace diskann {
94109
DISKANN_DLLEXPORT void setup_thread_data(_u64 nthreads,
95110
_u64 visited_reserve = 4096);
96111

112+
DISKANN_DLLEXPORT void set_universal_label(const LabelT &label);
113+
97114
private:
115+
DISKANN_DLLEXPORT inline bool point_has_label(_u32 point_id, _u32 label_id);
116+
std::unordered_map<std::string, LabelT> load_label_map(
117+
const std::string &map_file);
118+
DISKANN_DLLEXPORT void parse_label_file(const std::string &map_file,
119+
size_t &num_pts_labels);
120+
DISKANN_DLLEXPORT void get_label_file_metadata(std::string map_file,
121+
_u32 &num_pts,
122+
_u32 &num_total_labels);
123+
DISKANN_DLLEXPORT inline int32_t get_filter_number(
124+
const LabelT &filter_label);
125+
98126
// index info
99127
// nhood of node `i` is in sector: [i / nnodes_per_sector]
100128
// offset in sector: [(i % nnodes_per_sector) * max_node_len]
101129
// nnbrs of node `i`: *(unsigned*) (buf)
102130
// nbrs of node `i`: ((unsigned*)buf) + 1
131+
103132
_u64 max_node_len = 0, nnodes_per_sector = 0, max_degree = 0;
104133

105134
// Data used for searching with re-order vectors
@@ -171,6 +200,20 @@ namespace diskann {
171200
bool reorder_data_exists = false;
172201
_u64 reoreder_data_offset = 0;
173202

203+
// filter support
204+
_u32 *_pts_to_label_offsets = nullptr;
205+
_u32 *_pts_to_labels = nullptr;
206+
tsl::robin_set<LabelT> _labels;
207+
std::unordered_map<LabelT, _u32> _filter_to_medoid_id;
208+
bool _use_universal_label;
209+
_u32 _universal_filter_num;
210+
std::vector<LabelT> _filter_list;
211+
tsl::robin_set<_u32> _dummy_pts;
212+
tsl::robin_set<_u32> _has_dummy_pts;
213+
tsl::robin_map<_u32, _u32> _dummy_to_real_map;
214+
tsl::robin_map<_u32, std::vector<_u32>> _real_to_dummy_map;
215+
std::unordered_map<std::string, LabelT> _label_map;
216+
174217
#ifdef EXEC_ENV_OLS
175218
// Set to a larger value than the actual header to accommodate
176219
// any additions we make to the header. This is an outer limit

0 commit comments

Comments
 (0)