Merge pull request #219 from nmslib/develop

Update master to 0.4.0
nmslib · Jun 22, 2020 · 3c6a84f · 3c6a84f
2 parents a97ec89 + 92e5b74
commit 3c6a84f
Show file tree

Hide file tree

Showing 10 changed files with 765 additions and 112 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -5,13 +5,22 @@ include_directories("${PROJECT_BINARY_DIR}")
 
 
 
-set(SOURCE_EXE main.cpp)            
+set(SOURCE_EXE main.cpp)           
 
 set(SOURCE_LIB sift_1b.cpp)
 
 add_library(sift_test STATIC ${SOURCE_LIB})
 
 
 add_executable(main ${SOURCE_EXE})
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+  SET( CMAKE_CXX_FLAGS  "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
 SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
+endif()
+
+add_executable(test_updates examples/updates_test.cpp)
+
 target_link_libraries(main sift_test) 
diff --git a/README.md b/README.md
@@ -1,9 +1,13 @@
 # Hnswlib - fast approximate nearest neighbor search
-Header-only C++ HNSW implementation with python bindings. Paper code for the HNSW 200M SIFT experiment
+Header-only C++ HNSW implementation with python bindings. Paper's code for the HNSW 200M SIFT experiment
 
 **NEWS:**
 
-**Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib is now can be installed via pip!**
+* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but the performance/memory should not degrade as you update the element embeddings).**
+
+* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not a multiple of 4**
+
+* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**
 
 Highlights:
 1) Lightweight, header-only, no dependencies other than C++ 11.
@@ -23,10 +27,10 @@ Description of the algorithm parameters can be found in [ALGO_PARAMS.md](ALGO_PA
 | Distance         | parameter       | Equation                |
 | -------------    |:---------------:| -----------------------:|
 |Squared L2        |'l2'             | d = sum((Ai-Bi)^2)      |
-|Inner product     |'ip'             | d = 1.0 - sum(Ai\*Bi))  |
+|Inner product     |'ip'             | d = 1.0 - sum(Ai\*Bi)   |
 |Cosine similarity |'cosine'         | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi))|
 
-Note that inner product is not an actual metric. An element can be closer to some other element than to itself.
+Note that inner product is not an actual metric. An element can be closer to some other element than to itself. That allows some speedup if you remove all elements that are not the closest to themselves from the index.
 
 For other spaces use the nmslib library https://github.com/nmslib/nmslib. 
 
@@ -42,6 +46,7 @@ Index methods:
 * `add_items(data, data_labels, num_threads = -1)` - inserts the `data`(numpy array of vectors, shape:`N*dim`) into the structure. 
     * `labels` is an optional N-size numpy array of integer labels for all elements in `data`.
     * `num_threads` sets the number of cpu threads to use (-1 means use default).
+    * `data_labels` specifies the labels for the data. If the index already has elements with the same labels, their features will be updated. Note that the update procedure is slower than the insertion of a new element, but more memory- and query-efficient.
     * Thread-safe with other `add_items` calls, but not with `knn_query`.
 
 * `mark_deleted(data_label)`  - marks the element as deleted, so it will be omitted from search results.
@@ -223,6 +228,29 @@ To run the test on 200M SIFT subset:
 
 The size of the bigann subset (in millions) is controlled by the variable **subset_size_milllions** hardcoded in **sift_1b.cpp**.
 
+### Updates test
+To generate testing data (from root directory):
+```bash
+cd examples
+python update_gen_data.py
+```
+To compile (from root directory):
+```bash
+mkdir build
+cd build
+cmake ..
+make 
+```
+To run the test **without** updates (from the `build` directory):
+```bash
+./test_updates
+```
+
+To run the test **with** updates (from the `build` directory):
+```bash
+./test_updates update
+```
+
 ### HNSW example demos
 
 - Visual search engine for 1M amazon products (MXNet + HNSW): [website](https://thomasdelteil.github.io/VisualSearch_MXNet/), [code](https://github.com/ThomasDelteil/VisualSearch_MXNet), demo by [@ThomasDelteil](https://github.com/ThomasDelteil)

diff --git a/examples/update_gen_data.py b/examples/update_gen_data.py
@@ -0,0 +1,37 @@
+import numpy as np
+import os
+
+def normalized(a, axis=-1, order=2):
+    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
+    l2[l2==0] = 1
+    return a / np.expand_dims(l2, axis)
+
+N=100000
+dummy_data_multiplier=3
+N_queries = 1000
+d=8
+K=5
+
+np.random.seed(1)
+
+print("Generating data...")
+batches_dummy= [ normalized(np.float32(np.random.random( (N,d)))) for _ in range(dummy_data_multiplier)]
+batch_final = normalized (np.float32(np.random.random( (N,d))))
+queries = normalized(np.float32(np.random.random( (N_queries,d))))
+print("Computing distances...")
+dist=np.dot(queries,batch_final.T)
+topk=np.argsort(-dist)[:,:K]
+print("Saving...")
+
+try:
+    os.mkdir("data")
+except OSError as e:
+    pass
+
+for idx, batch_dummy in enumerate(batches_dummy):
+    batch_dummy.tofile('data/batch_dummy_%02d.bin' % idx)
+batch_final.tofile('data/batch_final.bin')
+queries.tofile('data/queries.bin')
+np.int32(topk).tofile('data/gt.bin')
+with open("data/config.txt", "w") as file:
+    file.write("%d %d %d %d %d" %(N, dummy_data_multiplier, N_queries, d, K))