Commit 8674ce9

Authored Oct 18, 2023
Harshasi/final runbook (harsha-simhadri#177)
* new classes, methods and config
* add runbook
* edit runbook_gen
* add final runbook
* remove data file in GT after calculation is done
* h5py libver latest
* set h5py dataset directly with 2D np array
* set k=10 and run_count to 1
* add final runbook to data_export
* reduce streaming runtime to 1hr
* try different configs
* add 8GB memory limit for streaming
* add README instructions
* make download_gt work
* update streaming runtime
1 parent f5bb90b commit 8674ce9

11 files changed: +4,099 −22 lines
 

‎benchmark/datasets.py

+30

@@ -193,6 +193,13 @@ def get_dataset_iterator(self, bs=512, split=(1,0)):
             j1 = min(j0 + bs, i1)
             yield sanitize(x[j0:j1])
 
+    def get_data_in_range(self, start, end):
+        assert start >= 0
+        assert end <= self.nb
+        filename = self.get_dataset_fn()
+        x = xbin_mmap(filename, dtype=self.dtype, maxn=self.nb)
+        return x[start:end]
+
     def search_type(self):
         return "knn"

@@ -434,6 +441,28 @@ def distance(self):
 
     def prepare(self, skip_data=False, original_size=10 ** 9):
         return super().prepare(skip_data, original_size = self.nb)
+
+class MSTuringClustered30M(DatasetCompetitionFormat):
+    def __init__(self):
+        self.nb = 29998994
+        self.d = 100
+        self.nq = 10000
+        self.dtype = "float32"
+        self.ds_fn = "30M-clustered64.fbin"
+        self.qs_fn = "testQuery10K.fbin"
+        self.gt_fn = "clu_msturing30M_gt100"
+
+        self.base_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp23/clustered_data/msturing-30M-clustered/"
+        self.basedir = os.path.join(BASEDIR, "MSTuring-30M-clustered")
+
+        self.private_gt_url = None
+        self.private_qs_url = None
+
+    def distance(self):
+        return "euclidean"
+
+    def prepare(self, skip_data=False, original_size=10 ** 9):
+        return super().prepare(skip_data, original_size = self.nb)
 
 class MSSPACEV1B(DatasetCompetitionFormat):
     def __init__(self, nb_M=1000):

@@ -984,6 +1013,7 @@ def __str__(self):
     'msturing-1M': lambda : MSTuringANNS(1),
 
     'msturing-10M-clustered': lambda: MSTuringClustered10M(),
+    'msturing-30M-clustered': lambda: MSTuringClustered30M(),
 
     'msspacev-1B': lambda : MSSPACEV1B(1000),
     'msspacev-100M': lambda : MSSPACEV1B(100),
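The new `get_data_in_range` reads a contiguous row range through the same `xbin_mmap` memory map the dataset iterator uses, so callers can pull only the rows a runbook step needs instead of materializing the whole base file. A minimal usage sketch, assuming the dataset files are already downloadable; the dataset name and slice bounds are illustrative:

```python
from benchmark.datasets import DATASETS

# Illustrative: any dataset exposing the new method works the same way.
ds = DATASETS['msturing-30M-clustered']()
ds.prepare()  # fetches base/query/GT files on first use

# Rows [1_000_000, 1_100_000) come from a memory-mapped view,
# so the full 30M-point file is never loaded into RAM.
chunk = ds.get_data_in_range(1_000_000, 1_100_000)
print(chunk.shape, chunk.dtype)  # expected: (100000, 100) float32
```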

‎benchmark/results.py

+3 −6

@@ -6,7 +6,6 @@
 import re
 import traceback
 
-
 def get_result_filename(dataset=None, count=None, definition=None,
                         query_arguments=None, neurips23track=None, runbook_path=None):
     d = ['results']

@@ -41,9 +40,7 @@ def get_result_filename(dataset=None, count=None, definition=None,
 
 def add_results_to_h5py(f, search_type, results, count, suffix = ''):
     if search_type == "knn" or search_type == "knn_filtered":
-        neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i')
-        for i, idxs in enumerate(results):
-            neighbors[i] = idxs
+        neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i', data = results)
     elif search_type == "range":
         lims, D, I= results
         f.create_dataset('neighbors' + suffix, data=I)

@@ -59,7 +56,7 @@ def store_results(dataset, count, definition, query_arguments,
     head, tail = os.path.split(fn)
     if not os.path.isdir(head):
         os.makedirs(head)
-    f = h5py.File(fn, 'w')
+    f = h5py.File(name=fn, mode='w', libver='latest')
     for k, v in attrs.items():
         f.attrs[k] = v
 

@@ -83,7 +80,7 @@ def load_all_results(dataset=None, count=None, neurips23track=None, runbook_path
             if os.path.splitext(fn)[-1] != '.hdf5':
                 continue
             try:
-                f = h5py.File(os.path.join(root, fn), 'r+')
+                f = h5py.File(name=os.path.join(root, fn), mode='r+', libver='latest')
                 properties = dict(f.attrs)
                 yield properties, f
                 f.close()
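In `add_results_to_h5py`, the per-row copy loop is replaced by a single `create_dataset(..., data=results)` call, so h5py ingests the whole 2D neighbor array at once, and result files are now opened with `libver='latest'`. A standalone sketch of the same pattern; the file name and array contents are made up:

```python
import h5py
import numpy as np

# Fake (nq, k) neighbor-id matrix standing in for the per-step search results.
results = np.random.randint(0, 30_000_000, size=(10_000, 10))

with h5py.File('example_results.hdf5', mode='w', libver='latest') as f:
    # One call writes the whole 2D array; no per-row assignment loop needed.
    f.create_dataset('neighbors', results.shape, 'i', data=results)
    f.attrs['count'] = 10
```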

‎benchmark/runner.py

+4 −2

@@ -100,7 +100,7 @@ def run(definition, dataset, count, run_count, rebuild,
         algo.set_query_arguments(*query_arguments)
         if neurips23track == 'streaming':
             descriptor, results = custom_runner.run_task(
-                algo, ds, distance, 1, run_count, search_type, private_query, runbook)
+                algo, ds, distance, count, 1, search_type, private_query, runbook)
         else:
             descriptor, results = custom_runner.run_task(
                 algo, ds, distance, count, run_count, search_type, private_query)

@@ -116,9 +116,11 @@
             X = ds.get_private_queries()
             power_stats = power_capture.run(algo, X, distance, count,
                                             run_count, search_type, descriptor)
+        print('start store results')
         store_results(dataset, count, definition,
                       query_arguments, descriptor,
                       results, search_type, neurips23track, runbook_path)
+        print('end store results')
     finally:
         algo.done()

@@ -263,7 +265,7 @@ def run_docker(definition, dataset, count, runs, timeout, rebuild,
     client = docker.from_env()
     if mem_limit is None:
-        mem_limit = psutil.virtual_memory().available
+        mem_limit = psutil.virtual_memory().available if neurips23track != 'streaming' else (8*1024*1024*1024)
 
     # ready the container object invoked later in this function
     container = None
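`run_docker` now caps streaming-track containers at 8 GiB instead of granting all available host memory, matching the DRAM limit stated in the README. A hedged sketch of how such a limit can be handed to docker-py (`mem_limit` is a real `containers.run` parameter, but the image tag and command below are placeholders, not the runner's actual invocation):

```python
import docker
import psutil

neurips23track = 'streaming'  # illustrative
mem_limit = (psutil.virtual_memory().available
             if neurips23track != 'streaming'
             else 8 * 1024 * 1024 * 1024)  # 8 GiB cap for the streaming track

client = docker.from_env()
container = client.containers.run(
    image='neurips23-streaming-diskann',   # placeholder image tag
    command='python3 run_algorithm.py',    # placeholder command
    mem_limit=mem_limit,                   # enforced by Docker as a hard memory cap
    detach=True,
)
print(container.id)
```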

‎benchmark/streaming/compute_gt.py

+4 −1

@@ -26,7 +26,7 @@ def gt_dir(ds, runbook_path):
     return os.path.join(ds.basedir, str(ds.nb), runbook_filename)
 
 def output_gt(ds, ids, step, gt_cmdline, runbook_path):
-    data = ds.get_dataset()
+    data = ds.get_data_in_range(0, ds.nb)
     data_slice = data[ids]
 
     dir = gt_dir(ds, runbook_path)

@@ -52,6 +52,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):
     gt_cmdline += ' --tags_file ' + tags_file
     print("Executing cmdline: ", gt_cmdline)
     os.system(gt_cmdline)
+    print("Removing data file")
+    rm_cmdline = "rm " + data_file
+    os.system(rm_cmdline)
 
 
 def main():
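`output_gt` now deletes the temporary data file once the ground-truth tool has finished, by shelling out to `rm`. An equivalent shell-free cleanup, shown only as a sketch (the path below is a placeholder; the real one is assembled inside `output_gt`):

```python
import os

data_file = '/tmp/runbook_step.data'  # placeholder path

if os.path.exists(data_file):
    os.remove(data_file)  # same effect as `rm <file>`, without spawning a shell
    print("Removed data file", data_file)
```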

‎data_export.py

+4 −1

@@ -95,7 +95,10 @@ def cleaned_run_metric(run_metrics):
         print(f"Looking at track:{track}, dataset:{dataset_name}")
         dataset = DATASETS[dataset_name]()
         if track == 'streaming':
-            for runbook_path in ['neurips23/streaming/simple_runbook.yaml', 'neurips23/streaming/clustered_runbook.yaml', 'neurips23/streaming/delete_runbook.yaml']:
+            for runbook_path in ['neurips23/streaming/simple_runbook.yaml',
+                                 'neurips23/streaming/clustered_runbook.yaml',
+                                 'neurips23/streaming/delete_runbook.yaml',
+                                 'neurips23/streaming/final_runbook.yaml']:
                 results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
                 run_metrics = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
                                                        args.sensors, args.search_times, args.private_query, \

‎neurips23/README.md

+8 −3

@@ -23,7 +23,7 @@ The Practical Vector Search challenge at NeurIPS 2023 has four different tasks:
 The tags are from a vocabulary of 200386 possible tags.
 The 100,000 queries consist of one image embedding and one or two tags that must appear in the database elements to be considered.
 
-**Task Streaming:** This task uses 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the "runbook" provided - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour. In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset. Entries will be ranked by average recall over queries at all check points. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements.
+**Task Streaming:** This task uses 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the "runbook" provided - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour and a DRAM limit of 8GB. Entries will be ranked by average recall over queries at all check points. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements. ~~In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset.~~ The final run will use `msturing-30M-clustered`, a 30M slice of the MSTuring dataset, and the `final_runbook.yaml` runbook.
 
 **Task Out-Of-Distribution:** Yandex Text-to-Image 10M represents a cross-modal dataset where the database and query vectors have different distributions in the shared vector space.
 The base set is a 10M subset of the Yandex visual search database of 200-dimensional image embeddings which are produced with the Se-ResNext-101 model.

@@ -46,6 +46,7 @@ The baselines were run on an Azure Standard D8lds v5 (8 vcpus, 16 GiB memory) ma
 |Sparse | Linear Scan | 101 | `python3 run.py --dataset sparse-full --algorithm linscan --neurips23track sparse` |
 |Filter | faiss | 3200 | `python3 run.py --dataset yfcc-10M --algorithm faiss --neurips23track filter` |
 |Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/delete_runbook.yaml` |
+|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/final_runbook.yaml` |
 |OOD | DiskANN | 4882 | `python3 run.py --dataset text2image-10M --algorithm diskann --neurips23track ood` |
 
 
@@ -110,13 +111,17 @@ For the competition dataset, run commands mentioned in the table above, for exam
 python run.py --neurips23track filter --algorithm faiss --dataset yfcc-10M
 python run.py --neurips23track sparse --algorithm linscan --dataset sparse-full
 python run.py --neurips23track ood --algorithm diskann --dataset text2image-10M
+# preliminary runbook for testing
 python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/streaming/delete_runbook.yaml
+#Final runbook for evaluation
+python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/streaming/final_runbook.yaml
 ```
 
 For streaming track, runbook specifies the order of operations to be executed by the algorithms. To download the ground truth for every search operation: (needs azcopy tool in your binary path):
 ```
-python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
-python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/final_runbook.yaml --dataset msturing-30M-clustered
 ```
 Alternately, to compute ground truth for an arbitrary runbook, [clone and build DiskANN repo](https://github.com/Microsoft/DiskANN) and use the command line tool to compute ground truth at various search checkpoints. The `--gt_cmdline_tool` points to the directory with DiskANN commandline tools.
 ```

‎neurips23/streaming/diskann/config.yaml

+15 −1

@@ -83,4 +83,18 @@ msturing-10M-clustered:
       args: |
         [{"R":64, "L":50, "insert_threads":16, "consolidate_threads":16}]
       query-args: |
-        [{"Ls":100, "T":16}]
+        [{"Ls":100, "T":16}]
+msturing-30M-clustered:
+  diskann:
+    docker-tag: neurips23-streaming-diskann
+    module: neurips23.streaming.diskann.diskann-str
+    constructor: diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":32, "L":50, "insert_threads":16, "consolidate_threads":16},
+           {"R":32, "L":70, "insert_threads":16, "consolidate_threads":16},
+           {"R":50, "L":50, "insert_threads":16, "consolidate_threads":16}]
+        query-args: |
+          [{"Ls":70, "T":16}]

‎neurips23/streaming/final_runbook.yaml

+3,843 lines (large diff not rendered)
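`final_runbook.yaml` is too large to render here, but it follows the format that `create_runbook` in the runbook-generator script below emits: one top-level key per dataset, a `max_pts` bound on simultaneously active points, and numbered steps, each a list of single-key dicts describing an `insert` range, a `delete` range, or a `search`. A sketch with made-up step ranges that reproduces that shape:

```python
import yaml

# Illustrative excerpt in the generator's output format; the real file
# holds a few thousand such steps for msturing-30M-clustered.
runbook = {
    'msturing-30M-clustered': {
        'max_pts': 10_500_000,  # made-up bound on active points
        1: [{'operation': 'insert'}, {'start': 0}, {'end': 3_000_000}],
        2: [{'operation': 'search'}],
        3: [{'operation': 'delete'}, {'start': 0}, {'end': 1_500_000}],
        4: [{'operation': 'search'}],
    }
}
print(yaml.dump(runbook))
```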
+177

@@ -0,0 +1,177 @@
import argparse
import os
import numpy as np
import random
import yaml

from scipy.cluster.vq import vq, kmeans2
from typing import Tuple
from benchmark.datasets import DATASETS

def cluster_and_permute(
    data, num_clusters
) -> Tuple[np.ndarray[int], np.ndarray[int]]:
    """
    Cluster the data and return permutation of row indices
    that would group indices of the same cluster together
    """
    npts = np.shape(data)[0]
    sample_size = min(100000, npts)
    sample_indices = np.random.choice(range(npts), size=sample_size, replace=False)
    sampled_data = data[sample_indices, :]
    centroids, sample_labels = kmeans2(sampled_data, num_clusters, minit="++", iter=10)
    labels, dist = vq(data, centroids)

    count = np.zeros(num_clusters)
    for i in range(npts):
        count[labels[i]] += 1
    print("Cluster counts")
    print(count)

    offsets = np.zeros(num_clusters + 1, dtype=int)
    for i in range(0, num_clusters, 1):
        offsets[i + 1] = offsets[i] + count[i]

    permutation = np.zeros(npts, dtype=int)
    counters = np.zeros(num_clusters, dtype=int)
    for i in range(npts):
        label = labels[i]
        row = offsets[label] + counters[label]
        counters[label] += 1
        permutation[row] = i

    return offsets, permutation


def write_permuated_data(
        data,
        permutation: np.ndarray[int],
        output_data_file: str
):
    permuted_data = data[permutation, :]

    shape = np.shape(permuted_data)
    with open(output_data_file, 'wb') as df:
        df.write(shape[0].to_bytes(4, 'little'))
        df.write(shape[1].to_bytes(4, 'little'))
        df.write(permuted_data)


def create_runbook(
        dataset_str: str,
        offsets: np.ndarray[int],
        permutation: np.ndarray[int],
        num_clusters: int,
        output_yaml_file: str
):
    ins_cursor_start = offsets.copy()
    ins_cursor_end = offsets.copy()

    del_cursor_start = offsets.copy()
    del_cursor_end = offsets.copy()

    operation_list = []
    num_operations = 1
    active_points = 0
    max_pts = 0
    active_points_in_cluster = np.zeros(num_clusters)

    num_rounds = 5
    sample = np.random.default_rng().dirichlet((100, 15, 10, 5, 3), num_clusters)
    for c in range(num_clusters):
        np.random.default_rng().shuffle(sample[c])
    print(sample)

    for round in range(num_rounds):
        # insertions
        for c in range(num_clusters):
            delta = (int)((offsets[c+1] - offsets[c]) * sample[c, round])
            ins_cursor_end[c] = ins_cursor_start[c] + delta
            active_points += delta
            max_pts = max(max_pts, active_points)
            active_points_in_cluster[c] += delta
            print('ins [', ins_cursor_start[c], ', ', ins_cursor_end[c],
                  ') active:', int(active_points_in_cluster[c]),
                  'total:', active_points)
            entry = [{'operation': 'insert'}, {'start': int(ins_cursor_start[c])}, {'end': int(ins_cursor_end[c])}]
            operation_list.append((num_operations, entry))
            num_operations += 1
            operation_list.append((num_operations, [{'operation': str('search')}]))
            num_operations += 1
            ins_cursor_start[c] = ins_cursor_end[c]

        # deletions
        for c in range(num_clusters):
            fraction = random.uniform(0.5, 0.9)
            delta = (int)(fraction * (ins_cursor_end[c] - del_cursor_start[c]))
            del_cursor_end[c] = del_cursor_start[c] + delta
            active_points -= delta
            active_points_in_cluster[c] -= delta
            print('del [', del_cursor_start[c], ',', del_cursor_end[c],
                  ') active:', int(active_points_in_cluster[c]),
                  'total:', active_points)
            entry = [{'operation': 'delete'}, {'start': int(del_cursor_start[c])}, {'end': int(del_cursor_end[c])}]
            operation_list.append((num_operations, entry))
            num_operations += 1
            operation_list.append((num_operations, [{'operation': 'search'}]))
            num_operations += 1
            del_cursor_start[c] = del_cursor_end[c]

    with open(output_yaml_file, 'w') as yf:
        operation_list.sort(key=lambda x: x[0])
        sorted_dict = {}
        sorted_dict['max_pts'] = int(max_pts)
        for (k, v) in operation_list:
            sorted_dict[k] = v
        yaml_object = {}
        yaml_object[dataset_str] = sorted_dict
        yaml.dump(yaml_object, yf)


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--dataset',
        choices=DATASETS.keys(),
        required=True)
    parser.add_argument(
        '-c', '--num_clusters',
        type=int,
        required=True
    )
    parser.add_argument(
        '-o', '--output_data_file',
        required=True
    )
    parser.add_argument(
        '-y', '--output_yaml_file',
        required=True
    )
    args = parser.parse_args()

    ds = DATASETS[args.dataset]()
    if ds.nb <= 10**7:
        data = ds.get_dataset()
    else:
        data = next(ds.get_dataset_iterator(bs=ds.nb))
    print(np.shape(data))

    offsets, permutation = cluster_and_permute(data, args.num_clusters)
    print(permutation)

    write_permuated_data(data=data,
                         permutation=permutation,
                         output_data_file=args.output_data_file)

    create_runbook(dataset_str=args.dataset,
                   offsets=offsets,
                   permutation=permutation,
                   num_clusters=args.num_clusters,
                   output_yaml_file=args.output_yaml_file)


if __name__ == '__main__':
    main()
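The insert schedule above comes from a Dirichlet draw: for each cluster, `dirichlet((100, 15, 10, 5, 3))` yields five fractions summing to 1, the row is shuffled, and round `r` then inserts that fraction of the cluster's points. A small self-contained sketch (cluster count, sizes, and seed are arbitrary) showing how the per-round insert deltas come out:

```python
import numpy as np

num_clusters, num_rounds = 4, 5
rng = np.random.default_rng(0)  # fixed seed so the sketch is reproducible

# Shape (num_clusters, num_rounds); each row sums to 1.
sample = rng.dirichlet((100, 15, 10, 5, 3), num_clusters)
for c in range(num_clusters):
    rng.shuffle(sample[c])  # randomize which round receives the large fraction

cluster_size = 1_000_000  # pretend every cluster holds 1M points
for rnd in range(num_rounds):
    deltas = (cluster_size * sample[:, rnd]).astype(int)
    print(f"round {rnd}: insert {deltas.tolist()} points per cluster")
```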

‎neurips23/streaming/run.py

+10 −7

@@ -17,15 +17,16 @@ def build(algo, dataset, max_pts):
     algo.setup(ds.dtype, max_pts, ndims)
     print('Algorithm set up')
     return time.time() - t0
+
 
 
 def run_task(algo, ds, distance, count, run_count, search_type, private_query, runbook):
     best_search_time = float('inf')
     search_times = []
     all_results = []
 
-    data = ds.get_dataset()
-    ids = np.arange(1, ds.nb+1, dtype=np.uint32)
+    # data = ds.get_dataset()
+    # ids = np.arange(1, ds.nb+1, dtype=np.uint32)
 
     Q = ds.get_queries() if not private_query else ds.get_private_queries()
     print(fr"Got {Q.shape[0]} queries")

@@ -34,11 +35,13 @@ def run_task(algo, ds, distance, count, run_count, search_type, private_query, r
     result_map = {}
     num_searches = 0
     for step, entry in enumerate(runbook):
-        start = time.time()
+        start_time = time.time()
         match entry['operation']:
             case 'insert':
-                ids = np.arange(entry['start'], entry['end'], dtype=np.uint32)
-                algo.insert(data[ids,:], ids)
+                start = entry['start']
+                end = entry['end']
+                ids = np.arange(start, end, dtype=np.uint32)
+                algo.insert(ds.get_data_in_range(start, end), ids)
             case 'delete':
                 ids = np.arange(entry['start'], entry['end'], dtype=np.uint32)
                 algo.delete(ids)

@@ -56,7 +59,7 @@ def run_task(algo, ds, distance, count, run_count, search_type, private_query, r
                 num_searches += 1
             case _:
                 raise NotImplementedError('Invalid runbook operation.')
-        step_time = (time.time() - start)
+        step_time = (time.time() - start_time)
         print(f"Step {step+1} took {step_time}s.")
 
     attrs = {

@@ -71,7 +74,7 @@ def run_task(algo, ds, distance, count, run_count, search_type, private_query, r
 
     for k, v in result_map.items():
         attrs['step_' + str(k)] = v
-
+
     additional = algo.get_additional()
     for k in additional:
         attrs[k] = additional[k]
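`run_task` drives the algorithm object through the runbook with `insert`, `delete`, and search steps, and the task asks for a compact index over only the active points. A toy brute-force illustration of that contract (this is not the repo's streaming algorithm interface, just a sketch of the insert/delete/search cycle the runbook exercises):

```python
import numpy as np

class ToyStreamingIndex:
    """Keeps only active points; brute-force search. Illustrative, not competitive."""

    def __init__(self):
        self.vectors = {}  # id -> vector; the "compact" active set

    def insert(self, X, ids):
        for vec, i in zip(X, ids):
            self.vectors[int(i)] = np.asarray(vec, dtype=np.float32)

    def delete(self, ids):
        for i in ids:
            self.vectors.pop(int(i), None)  # active set shrinks immediately

    def search(self, q, k=10):
        ids = np.fromiter(self.vectors.keys(), dtype=np.int64)
        mat = np.stack([self.vectors[int(i)] for i in ids])
        dists = np.linalg.norm(mat - q, axis=1)  # euclidean, as in the MSTuring datasets
        return ids[np.argsort(dists)[:k]]

index = ToyStreamingIndex()
index.insert(np.random.rand(1000, 100), np.arange(1, 1001))
index.delete(np.arange(1, 501))
print(index.search(np.random.rand(100)))
```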

‎requirements_py3.10.txt

+1 −1

@@ -1,6 +1,6 @@
 ansicolors==1.1.8
 docker==6.1.2
-h5py==3.8.0
+h5py==3.10.0
 matplotlib==3.3.4
 numpy==1.24.2
 pyyaml==6.0
pyyaml==6.0
