diff --git a/benchmark/datasets.py b/benchmark/datasets.py index e86a17f6..be63621d 100644 --- a/benchmark/datasets.py +++ b/benchmark/datasets.py @@ -432,6 +432,9 @@ def __init__(self): def distance(self): return "euclidean" + + def prepare(self, skip_data=False, original_size=10 ** 9): + return super().prepare(skip_data, original_size = self.nb) class MSSPACEV1B(DatasetCompetitionFormat): def __init__(self, nb_M=1000): @@ -491,7 +494,7 @@ def default_count(self): return 10 def prepare(self, skip_data=False, original_size=10 ** 9): - return super().prepare(skip_data, self.nb) + return super().prepare(skip_data, original_size = self.nb) class RandomRangeDS(DatasetCompetitionFormat): def __init__(self, nb, nq, d): diff --git a/neurips23/README.md b/neurips23/README.md index bb39c8a2..4f12cb0f 100644 --- a/neurips23/README.md +++ b/neurips23/README.md @@ -111,12 +111,15 @@ python run.py --neurips23track streaming --algorithm diskann --dataset msspacev- For streaming track, download the ground truth (needs azcopy in your binary path): ``` - python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M +python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M +python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/clustered_runbook.yaml --dataset msturing-10M-clustered ``` Alternately, to compute ground truth for an arbitrary runbook, [clone and build DiskANN repo](https://github.com/Microsoft/DiskANN) and use the command line tool to compute ground truth at various search checkpoints. The `--gt_cmdline_tool` points to the directory with DiskANN commandline tools. 
``` python benchmark/streaming/compute_gt.py --dataset msspacev-10M --runbook neurips23/streaming/simple_runbook.yaml --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth ``` +For streaming track, consider also the examples in [clustered runbook](neurips23/streaming/clustered_runbook.yaml). The datasets here are [generated](neurips23/streaming/clustered_data_gen.py) by clustering the original dataset with k-means and packing points in the same cluster into contiguous indices. Insertions are then performed one cluster at a time. This runbook tests if an indexing algorithm can adapt to data drift. + To make the results available for post-processing, change permissions of the results folder