Merge pull request #445 from brj0/add-nndescent-algorithm

add nndescent algorithm
erikbern · Jul 28, 2023 · f5ba3ce · f5ba3ce
2 parents 6565385 + 5a062d0
commit f5ba3ce
Show file tree

Hide file tree

Showing 5 changed files with 267 additions and 2 deletions.
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -45,6 +45,7 @@ jobs:
           - luceneknn
           - milvus
           - mrpt
+          - nndescent
           - n2
           - nmslib
           - onng_ngt
@@ -64,7 +65,7 @@ jobs:
           - vespa
           - weaviate
         include:
-          - library: pynndescent 
+          - library: pynndescent
             dataset: random-xs-16-hamming
           - library: datasketch
             dataset: random-s-jaccard
@@ -88,7 +89,7 @@ jobs:
 
     - name: Build Library Docker Image
       run: python3 install.py
-    
+
     - name: Run the benchmark
       run: |
         python3 run.py --docker-tag ann-benchmarks-${LIBRARY} --max-n-algorithms 3 --runs 2 --dataset $DATASET --run-disabled --timeout 300

diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ Evaluated
 * [FAISS](https://github.com/facebookresearch/faiss) ![https://img.shields.io/github/stars/facebookresearch/faiss?style=social](https://img.shields.io/github/stars/facebookresearch/faiss?style=social)
 * [DolphinnPy](https://github.com/ipsarros/DolphinnPy) ![https://img.shields.io/github/stars/ipsarros/DolphinnPy?style=social](https://img.shields.io/github/stars/ipsarros/DolphinnPy?style=social)
 * [Datasketch](https://github.com/ekzhu/datasketch) ![https://img.shields.io/github/stars/ekzhu/datasketch?style=social](https://img.shields.io/github/stars/ekzhu/datasketch?style=social)
+* [nndescent](https://github.com/brj0/nndescent) ![https://img.shields.io/github/stars/brj0/nndescent?style=social](https://img.shields.io/github/stars/brj0/nndescent?style=social)
 * [PyNNDescent](https://github.com/lmcinnes/pynndescent) ![https://img.shields.io/github/stars/lmcinnes/pynndescent?style=social](https://img.shields.io/github/stars/lmcinnes/pynndescent?style=social)
 * [MRPT](https://github.com/teemupitkanen/mrpt) ![https://img.shields.io/github/stars/teemupitkanen/mrpt?style=social](https://img.shields.io/github/stars/teemupitkanen/mrpt?style=social)
 * [NGT](https://github.com/yahoojapan/NGT) ![https://img.shields.io/github/stars/yahoojapan/NGT?style=social](https://img.shields.io/github/stars/yahoojapan/NGT?style=social): ONNG, PANNG, QG

diff --git a/ann_benchmarks/algorithms/nndescent/Dockerfile b/ann_benchmarks/algorithms/nndescent/Dockerfile
@@ -0,0 +1,5 @@
+# Image for benchmarking the nndescent library; built on the shared
+# ann-benchmarks base image.
+FROM ann-benchmarks
+
+# The requirement specifier must be quoted: RUN uses /bin/sh, which would
+# otherwise treat ">=1.0.4" as a redirect of stdout to a file named "=1.0.4"
+# and install an unconstrained nndescent version.
+RUN pip3 install --verbose numpy 'nndescent>=1.0.4'
+
+# Fail the image build early if the package cannot be imported.
+RUN python3 -c 'import nndescent'
diff --git a/ann_benchmarks/algorithms/nndescent/config.yml b/ann_benchmarks/algorithms/nndescent/config.yml
@@ -0,0 +1,138 @@
+# Benchmark configuration for the NNDescent wrapper in
+# ann_benchmarks/algorithms/nndescent/module.py.
+# The keys inside each arg_groups mapping (n_neighbors, leaf_size,
+# pruning_prob, pruning_degree_multiplier) are the ones NNDescent.__init__
+# reads from its index_param_dict; query_args values are the epsilon passed
+# to set_query_arguments.
+# NOTE(review): list-valued entries are presumably expanded by the benchmark
+# runner into one run per combination - confirm against the ann-benchmarks
+# documentation.
+
+# Set-like / binary datasets (hamming and jaccard metrics).
+bit:
+  hamming:
+  - base_args: ['@metric']
+    constructor: NNDescent
+    disabled: false
+    docker_tag: ann-benchmarks-nndescent
+    module: ann_benchmarks.algorithms.nndescent
+    name: nndescent
+    # Run groups are named after the n_neighbors value they use.
+    run_groups:
+      NN-120:
+        arg_groups: [{pruning_prob: [0.0, 1.0], leaf_size: 80, n_neighbors: [120],
+            pruning_degree_multiplier: [2.0, 2.5]}]
+        args: {}
+        query_args: [[0.08, 0.16, 0.2, 0.24, 0.28, 0.32, 0.36]]
+      NN-20:
+        arg_groups: [{pruning_prob: [0.75, 1.0], leaf_size: 32, n_neighbors: [20],
+            pruning_degree_multiplier: [1.0, 1.5]}]
+        args: {}
+        query_args: [[0.0, 0.01, 0.02, 0.04, 0.08, 0.12, 0.16]]
+      NN-40:
+        arg_groups: [{pruning_prob: [0.5, 1.0], leaf_size: 48, n_neighbors: [40],
+            pruning_degree_multiplier: [1.5, 2.0]}]
+        args: {}
+        query_args: [[0.0, 0.04, 0.08, 0.12, 0.16, 0.2, 0.24]]
+      NN-80:
+        arg_groups: [{pruning_prob: [0.25, 1.0], leaf_size: 64, n_neighbors: [80],
+            pruning_degree_multiplier: [1.75, 2.25]}]
+        args: {}
+        query_args: [[0.0, 0.08, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32]]
+  # The wrapper treats jaccard data as sparse (is_sparse is set for this
+  # metric only) and converts it to a CSR matrix before indexing.
+  jaccard:
+  - base_args: ['@metric']
+    constructor: NNDescent
+    disabled: false
+    docker_tag: ann-benchmarks-nndescent
+    module: ann_benchmarks.algorithms.nndescent
+    name: nndescent
+    # Every jaccard group keeps pruning_degree_multiplier fixed at 1.0.
+    run_groups:
+      NN-120:
+        arg_groups: [{pruning_prob: [1.0, 0.125], leaf_size: 80, n_neighbors: 120,
+            pruning_degree_multiplier: 1.0}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22]]
+      NN-20:
+        arg_groups: [{pruning_prob: [0.75, 1.0], leaf_size: 30, n_neighbors: 20,
+            pruning_degree_multiplier: 1.0}]
+        args: {}
+        query_args: [[0.0, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.12, 0.16, 0.2]]
+      NN-40:
+        arg_groups: [{pruning_prob: [0.5, 1.0], leaf_size: 30, n_neighbors: 40,
+            pruning_degree_multiplier: 1.0}]
+        args: {}
+        query_args: [[0.0, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.12, 0.16, 0.2]]
+      NN-80:
+        arg_groups: [{pruning_prob: [1.0, 0.25], leaf_size: 60, n_neighbors: 80,
+            pruning_degree_multiplier: 1.0}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22]]
+# Real-valued datasets. Each arg_groups mapping supplies the index parameters
+# read by NNDescent.__init__ in module.py (n_neighbors, leaf_size,
+# pruning_prob, pruning_degree_multiplier); query_args supplies the epsilon
+# values passed to set_query_arguments.
+float:
+  # The wrapper maps the "angular" metric to nndescent's "dot" metric.
+  angular:
+  - base_args: ['@metric']
+    constructor: NNDescent
+    disabled: false
+    docker_tag: ann-benchmarks-nndescent
+    module: ann_benchmarks.algorithms.nndescent
+    name: nndescent
+    # The -accurate / -fast variants differ in pruning_prob, leaf_size and
+    # the epsilon range swept in query_args.
+    run_groups:
+      NN-120-accurate:
+        arg_groups: [{pruning_prob: 0.125, leaf_size: 35, n_neighbors: 120, pruning_degree_multiplier: 2.5}]
+        args: {}
+        query_args: [[0.16, 0.2, 0.24, 0.28, 0.32, 0.36]]
+      NN-120-fast:
+        arg_groups: [{pruning_prob: 1.0, leaf_size: 20, n_neighbors: 120, pruning_degree_multiplier: 2.5}]
+        args: {}
+        query_args: [[0.0, 0.04, 0.08, 0.16, 0.2, 0.24, 0.28, 0.32]]
+      NN-20:
+        arg_groups: [{pruning_prob: [1.0], leaf_size: 20, n_neighbors: [20], pruning_degree_multiplier: [
+              0.5, 1.0]}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12]]
+      NN-40:
+        arg_groups: [{pruning_prob: [0.5, 1.0], leaf_size: 25, n_neighbors: [40],
+            pruning_degree_multiplier: [1.5]}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16]]
+      NN-80-accurate:
+        arg_groups: [{pruning_prob: 0.25, leaf_size: 30, n_neighbors: 80, pruning_degree_multiplier: 2.0}]
+        args: {}
+        query_args: [[0.08, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32, 0.36]]
+      NN-80-fast:
+        arg_groups: [{pruning_prob: 1.0, leaf_size: 20, n_neighbors: 80, pruning_degree_multiplier: 2.0}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.08, 0.12, 0.16, 0.2, 0.24]]
+  # NOTE(review): "any" is presumably a fallback for metrics without their own
+  # section - confirm. The wrapper only accepts angular, euclidean, hamming
+  # and jaccard; any other metric name raises KeyError in NNDescent.__init__.
+  any:
+  - base_args: ['@metric']
+    constructor: NNDescent
+    disabled: false
+    docker_tag: ann-benchmarks-nndescent
+    module: ann_benchmarks.algorithms.nndescent
+    name: nndescent
+    run_groups:
+      NN-10-20:
+        arg_groups: [{pruning_prob: [1.0], leaf_size: 32, n_neighbors: [10, 20],
+            pruning_degree_multiplier: [1.5, 2.0]}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16]]
+      NN-40-80:
+        arg_groups: [{pruning_prob: [0.0, 1.0], leaf_size: 64, n_neighbors: [40,
+              80], pruning_degree_multiplier: [2.0, 2.5]}]
+        args: {}
+        query_args: [[0.0, 0.04, 0.08, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32]]
+  euclidean:
+  - base_args: ['@metric']
+    constructor: NNDescent
+    disabled: false
+    docker_tag: ann-benchmarks-nndescent
+    module: ann_benchmarks.algorithms.nndescent
+    name: nndescent
+    # Run groups are named after the n_neighbors value they use.
+    run_groups:
+      NN-10:
+        arg_groups: [{pruning_prob: 1.0, leaf_size: 24, n_neighbors: 10, pruning_degree_multiplier: [
+              0.5, 1.0]}]
+        args: {}
+        query_args: [[0.0, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1, 0.12]]
+      NN-20:
+        arg_groups: [{pruning_prob: 1.0, leaf_size: 24, n_neighbors: 20, pruning_degree_multiplier: [
+              0.75, 1.5]}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.2]]
+      NN-40:
+        arg_groups: [{pruning_prob: [0.0, 1.0], leaf_size: 36, n_neighbors: 40,
+            pruning_degree_multiplier: [1.0, 2.0]}]
+        args: {}
+        query_args: [[0.0, 0.02, 0.04, 0.08, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32]]
+      NN-60:
+        arg_groups: [{pruning_prob: 0.0, leaf_size: 48, n_neighbors: 60, pruning_degree_multiplier: [
+              2.0, 3.0]}]
+        args: {}
+        query_args: [[0.0, 0.04, 0.08, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32, 0.36]]
diff --git a/ann_benchmarks/algorithms/nndescent/module.py b/ann_benchmarks/algorithms/nndescent/module.py
@@ -0,0 +1,120 @@
+import numpy as np
+import nndescent
+import scipy.sparse
+
+from ..base.module import BaseANN
+
+
+class NNDescent(BaseANN):
+    def __init__(self, metric, index_param_dict):
+        if "n_neighbors" in index_param_dict:
+            self.n_neighbors = int(index_param_dict["n_neighbors"])
+        else:
+            self.n_neighbors = 30
+
+        if "pruning_degree_multiplier" in index_param_dict:
+            self.pruning_degree_multiplier = float(
+                index_param_dict["pruning_degree_multiplier"]
+            )
+        else:
+            self.pruning_degree_multiplier = 1.5
+
+        if "pruning_prob" in index_param_dict:
+            self.pruning_prob = float(index_param_dict["pruning_prob"])
+        else:
+            self.pruning_prob = 1.0
+
+        if "leaf_size" in index_param_dict:
+            self.leaf_size = int(index_param_dict["leaf_size"])
+
+        self.is_sparse = metric in ["jaccard"]
+
+        self.nnd_metric = {
+            "angular": "dot",
+            "euclidean": "euclidean",
+            "hamming": "hamming",
+            "jaccard": "jaccard",
+        }[metric]
+
+    def fit(self, X):
+        if self.is_sparse:
+            # Convert to sparse matrix format
+            if type(X) == list:
+                sizes = [len(x) for x in X]
+                n_cols = max([max(x) for x in X]) + 1
+                matrix = scipy.sparse.csr_matrix(
+                    (len(X), n_cols), dtype=np.float32
+                )
+                matrix.indices = np.hstack(X).astype(np.int32)
+                matrix.indptr = np.concatenate([[0], np.cumsum(sizes)]).astype(
+                    np.int32
+                )
+                matrix.data = np.ones(
+                    matrix.indices.shape[0], dtype=np.float32
+                )
+                matrix.sort_indices()
+                X = matrix
+            else:
+                X = scipy.sparse.csr_matrix(X)
+
+            self.query_matrix = scipy.sparse.csr_matrix(
+                (1, X.shape[1]), dtype=np.float32
+            )
+        elif not isinstance(X, np.ndarray) or X.dtype != np.float32:
+            print("Convert data to float32")
+            X = np.asarray(X, dtype=np.float32)
+
+        # nndescent uses pointers to the data. Make shure X does not change
+        # outside of this scope.
+        self.X = X
+        self.index = nndescent.NNDescent(
+            self.X,
+            n_neighbors=self.n_neighbors,
+            metric=self.nnd_metric,
+            leaf_size=self.leaf_size,
+            pruning_degree_multiplier=self.pruning_degree_multiplier,
+            pruning_prob=self.pruning_prob,
+            verbose=True,
+        )
+        # Make a dummy query to prepare the search graph.
+        if self.is_sparse:
+            empty_mtx = np.empty((0, X.shape[0]), dtype=np.float32)
+            empty_csr = scipy.sparse.csr_matrix(empty_mtx)
+            self.index.query(empty_csr, k=1, epsilon=0.1)
+        else:
+            empty_mtx = np.empty((0, X.shape[0]), dtype=np.float32)
+            self.index.query(empty_mtx, k=1, epsilon=0.1)
+
+    def set_query_arguments(self, epsilon=0.1):
+        self.epsilon = float(epsilon)
+
+    def query(self, v, n):
+        if self.is_sparse:
+            # Convert index array to sparse matrix format and query; the
+            # overhead of direct conversion is high for single queries
+            # (converting the entire test dataset and sending single rows is
+            # better), so we just populate the required structures.
+            if v.dtype == np.bool_:
+                self.query_matrix.indices = np.flatnonzero(v).astype(np.int32)
+            else:
+                self.query_matrix.indices = v.astype(np.int32)
+            size = self.query_matrix.indices.shape[0]
+            self.query_matrix.indptr = np.array([0, size], dtype=np.int32)
+            self.query_matrix.data = np.ones(size, dtype=np.float32)
+            ind, dist = self.index.query(
+                self.query_matrix, k=n, epsilon=self.epsilon
+            )
+        else:
+            ind, dist = self.index.query(
+                v.reshape(1, -1).astype("float32"), k=n, epsilon=self.epsilon
+            )
+        return ind[0]
+
+    def __str__(self):
+        return (
+            f"NNDescent(n_neighbors={self.n_neighbors}, "
+            f"pruning_mult={self.pruning_degree_multiplier:.2f}, "
+            f"pruning_prob={self.pruning_prob:.3f}, "
+            f"epsilon={self.epsilon:.3f}, "
+            f"leaf_size={self.leaf_size:02d})"
+        )