diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8cfa469a..b75c72c7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{matrix.os}} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest, windows-latest, macos-latest] python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 @@ -28,7 +28,7 @@ jobs: runs-on: ${{matrix.os}} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 @@ -40,10 +40,10 @@ jobs: mkdir build cd build cmake .. - if [ "$RUNNER_OS" == "Linux" ]; then - make - elif [ "$RUNNER_OS" == "Windows" ]; then + if [ "$RUNNER_OS" == "Windows" ]; then cmake --build ./ --config Release + else + make fi shell: bash @@ -67,10 +67,14 @@ jobs: ./example_mt_search ./example_mt_filter ./example_mt_replace_deleted + ./example_multivector_search + ./example_epsilon_search ./searchKnnCloserFirst_test ./searchKnnWithFilter_test ./multiThreadLoad_test ./multiThread_replace_test ./test_updates ./test_updates update + ./multivector_search_test + ./epsilon_search_test shell: bash diff --git a/.gitignore b/.gitignore index 48f74604..d46c9890 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ var/ .vscode/ .vs/ **.DS_Store +*.pyc diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cebe600..be0d40f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,25 +1,68 @@ -cmake_minimum_required (VERSION 2.6) -project(hnsw_lib +cmake_minimum_required(VERSION 3.0...3.26) + +project(hnswlib LANGUAGES CXX) +include(GNUInstallDirs) +include(CheckCXXCompilerFlag) + add_library(hnswlib INTERFACE) -target_include_directories(hnswlib INTERFACE .) 
+add_library(hnswlib::hnswlib ALIAS hnswlib)
+
+target_include_directories(hnswlib INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+# Install
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+install(TARGETS hnswlib
+    EXPORT hnswlibTargets)
+
+install(EXPORT hnswlibTargets
+    FILE hnswlibConfig.cmake
+    NAMESPACE hnswlib::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)

+# Examples and tests
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+    option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
+else()
+    option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
+endif()
+if(HNSWLIB_EXAMPLES)
     set(CMAKE_CXX_STANDARD 11)

-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-      SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
+      check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
+      if(COMPILER_SUPPORT_NATIVE_FLAG)
+        SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
+        message(STATUS "set -march=native flag")
+      else()
+        check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
+        if(COMPILER_SUPPORT_M1_FLAG)
+          SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
+          message(STATUS "set -mcpu=apple-m1 flag")
+        endif()
+      endif()
     elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
+      SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
     elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-      SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
+      SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
     endif()

     # examples
     add_executable(example_search
examples/cpp/example_search.cpp) target_link_libraries(example_search hnswlib) + add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp) + target_link_libraries(example_epsilon_search hnswlib) + + add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp) + target_link_libraries(example_multivector_search hnswlib) + add_executable(example_filter examples/cpp/example_filter.cpp) target_link_libraries(example_filter hnswlib) @@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) target_link_libraries(example_mt_replace_deleted hnswlib) # tests + add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp) + target_link_libraries(multivector_search_test hnswlib) + + add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp) + target_link_libraries(epsilon_search_test hnswlib) + add_executable(test_updates tests/cpp/updates_test.cpp) target_link_libraries(test_updates hnswlib) diff --git a/README.md b/README.md index 3ed466a7..6eefcf20 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates **NEWS:** +**version 0.8.0** + +* Multi-vector document search and epsilon search (for now, only in C++) +* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)). 
+* Various bugfixes and improvements
+* `get_items` now has a `return_type` parameter, which can be either 'numpy' or 'list'
+
+Full list of changes: https://github.com/nmslib/hnswlib/pull/523
+
 **version 0.7.0**

 * Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
@@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.

 * `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying.

-* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors.
+* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). If `return_type` is `'list'`, returns a list of lists instead. Note that for cosine similarity it currently returns **normalized** vectors.

 * `get_ids_list()` - returns a list of all elements' ids.
@@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat * filtering during the search with a boolean function * deleting the elements and reusing the memory of the deleted elements for newly added elements * multithreaded usage +* multivector search +* epsilon search ### Bindings installation diff --git a/examples/cpp/EXAMPLES.md b/examples/cpp/EXAMPLES.md index 3af603d4..5f9adc30 100644 --- a/examples/cpp/EXAMPLES.md +++ b/examples/cpp/EXAMPLES.md @@ -182,4 +182,8 @@ int main() { Multithreaded examples: * Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp) * Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp) -* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp) \ No newline at end of file +* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp) + +More examples: +* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp) +* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp) \ No newline at end of file diff --git a/examples/cpp/example_epsilon_search.cpp b/examples/cpp/example_epsilon_search.cpp new file mode 100644 index 00000000..49eec408 --- /dev/null +++ b/examples/cpp/example_epsilon_search.cpp @@ -0,0 +1,66 @@ +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + int min_num_candidates = 100; // Minimum number of 
candidates to search in the epsilon region + // this parameter is similar to ef + + int num_queries = 5; + float epsilon2 = 2.0; // Squared distance to query + + // Initing index + hnswlib::L2Space space(dim); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; i++) { + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + } + + // Add data to index + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + } + + // Query random vectors + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[data_point_size]; + for (int j = 0; j < dim; j++) { + size_t offset = j * sizeof(float); + char* vec_data = query_data + offset; + float value = distrib_real(rng); + *(float*)vec_data = value; + } + std::cout << "Query #" << i << "\n"; + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2, min_num_candidates, max_elements); + std::vector> result = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + size_t num_vectors = result.size(); + std::cout << "Found " << num_vectors << " vectors\n"; + delete[] query_data; + } + + delete[] data; + delete alg_hnsw; + return 0; +} diff --git a/examples/cpp/example_multivector_search.cpp b/examples/cpp/example_multivector_search.cpp new file mode 100644 index 00000000..06aafe0b --- /dev/null +++ b/examples/cpp/example_multivector_search.cpp @@ -0,0 +1,83 @@ +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef 
float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 5; + int num_docs = 5; // Number of documents to search + int ef_collection = 6; // Number of candidate documents during the search + // Controlls the recall: higher ef leads to better accuracy, but slower search + docidtype min_doc_id = 0; + docidtype max_doc_id = 9; + + // Initing index + hnswlib::MultiVectorL2Space space(dim); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + std::uniform_int_distribution distrib_docid(min_doc_id, max_doc_id); + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; i++) { + // set vector value + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + // set document id + docidtype doc_id = distrib_docid(rng); + space.set_doc_id(point_data, doc_id); + } + + // Add data to index + std::unordered_map label_docid_lookup; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + label_docid_lookup[label] = space.get_doc_id(point_data); + } + + // Query random vectors + size_t query_size = dim * sizeof(float); + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[query_size]; + for (int j = 0; j < dim; j++) { + size_t offset = j 
* sizeof(float);
+            char* vec_data = query_data + offset;
+            float value = distrib_real(rng);
+            *(float*)vec_data = value;
+        }
+        std::cout << "Query #" << i << "\n";
+        hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
+        std::vector<std::pair<float, hnswlib::labeltype>> result =
+            alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
+        size_t num_vectors = result.size();
+
+        std::unordered_map<docidtype, size_t> doc_counter;
+        for (auto pair: result) {
+            hnswlib::labeltype label = pair.second;
+            docidtype doc_id = label_docid_lookup[label];
+            doc_counter[doc_id] += 1;
+        }
+        std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
+        delete[] query_data;
+    }
+
+    delete[] data;
+    delete alg_hnsw;
+    return 0;
+}
diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h
index 30b33ae9..8727cc8a 100644
--- a/hnswlib/bruteforce.h
+++ b/hnswlib/bruteforce.h
@@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {


     void removePoint(labeltype cur_external) {
-        size_t cur_c = dict_external_to_internal[cur_external];
+        std::unique_lock<std::mutex> lock(index_lock);

-        dict_external_to_internal.erase(cur_external);
+        auto found = dict_external_to_internal.find(cur_external);
+        if (found == dict_external_to_internal.end()) {
+            return;
+        }
+
+        size_t cur_c = found->second;  // must be read before erase(): erasing invalidates `found`
+        dict_external_to_internal.erase(found);

         labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
         dict_external_to_internal[label] = cur_c;
         memcpy(data_ + size_per_element_ * cur_c,
@@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
             dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
             labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_));
             if ((!isIdAllowed) || (*isIdAllowed)(label)) {
-                topResults.push(std::pair<dist_t, labeltype>(dist, label));
+                topResults.emplace(dist, label);
             }
         }
         dist_t lastdist = topResults.empty() ?
std::numeric_limits::max() : topResults.top().first; @@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface { if (dist <= lastdist) { labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_)); if ((!isIdAllowed) || (*isIdAllowed)(label)) { - topResults.push(std::pair(dist, label)); + topResults.emplace(dist, label); } if (topResults.size() > k) topResults.pop(); diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index bef00170..e269ae69 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace hnswlib { typedef unsigned int tableint; @@ -33,7 +34,7 @@ class HierarchicalNSW : public AlgorithmInterface { double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; - VisitedListPool *visited_list_pool_{nullptr}; + std::unique_ptr visited_list_pool_{nullptr}; // Locks operations with element by label value mutable std::vector label_op_locks_; @@ -92,8 +93,8 @@ class HierarchicalNSW : public AlgorithmInterface { size_t ef_construction = 200, size_t random_seed = 100, bool allow_replace_deleted = false) - : link_list_locks_(max_elements), - label_op_locks_(MAX_LABEL_OPERATION_LOCKS), + : label_op_locks_(MAX_LABEL_OPERATION_LOCKS), + link_list_locks_(max_elements), element_levels_(max_elements), allow_replace_deleted_(allow_replace_deleted) { max_elements_ = max_elements; @@ -101,7 +102,13 @@ class HierarchicalNSW : public AlgorithmInterface { data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); - M_ = M; + if ( M <= 10000 ) { + M_ = M; + } else { + HNSWERR << "warning: M parameter exceeds 10000 which may lead to adverse effects." << std::endl; + HNSWERR << " Cap to 10000 will be applied for the rest of the processing." 
<< std::endl; + M_ = 10000; + } maxM_ = M_; maxM0_ = M_ * 2; ef_construction_ = std::max(ef_construction, M_); @@ -122,7 +129,7 @@ class HierarchicalNSW : public AlgorithmInterface { cur_element_count = 0; - visited_list_pool_ = new VisitedListPool(1, max_elements); + visited_list_pool_ = std::unique_ptr(new VisitedListPool(1, max_elements)); // initializations for special treatment of the first node enterpoint_node_ = -1; @@ -138,13 +145,20 @@ class HierarchicalNSW : public AlgorithmInterface { ~HierarchicalNSW() { + clear(); + } + + void clear() { free(data_level0_memory_); + data_level0_memory_ = nullptr; for (tableint i = 0; i < cur_element_count; i++) { if (element_levels_[i] > 0) free(linkLists_[i]); } free(linkLists_); - delete visited_list_pool_; + linkLists_ = nullptr; + cur_element_count = 0; + visited_list_pool_.reset(nullptr); } @@ -291,9 +305,15 @@ class HierarchicalNSW : public AlgorithmInterface { } - template + // bare_bone_search means there is no check for deletions and stop condition is ignored in return of extra performance + template std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const { + searchBaseLayerST( + tableint ep_id, + const void *data_point, + size_t ef, + BaseFilterFunctor* isIdAllowed = nullptr, + BaseSearchStopCondition* stop_condition = nullptr) const { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; @@ -302,10 +322,15 @@ class HierarchicalNSW : public AlgorithmInterface { std::priority_queue, std::vector>, CompareByFirst> candidate_set; dist_t lowerBound; - if ((!has_deletions || !isMarkedDeleted(ep_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id)))) { - dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_); + if (bare_bone_search || + (!isMarkedDeleted(ep_id) && ((!isIdAllowed) || 
(*isIdAllowed)(getExternalLabel(ep_id))))) { + char* ep_data = getDataByInternalId(ep_id); + dist_t dist = fstdistfunc_(data_point, ep_data, dist_func_param_); lowerBound = dist; top_candidates.emplace(dist, ep_id); + if (!bare_bone_search && stop_condition) { + stop_condition->add_point_to_result(getExternalLabel(ep_id), ep_data, dist); + } candidate_set.emplace(-dist, ep_id); } else { lowerBound = std::numeric_limits::max(); @@ -316,9 +341,19 @@ class HierarchicalNSW : public AlgorithmInterface { while (!candidate_set.empty()) { std::pair current_node_pair = candidate_set.top(); + dist_t candidate_dist = -current_node_pair.first; - if ((-current_node_pair.first) > lowerBound && - (top_candidates.size() == ef || (!isIdAllowed && !has_deletions))) { + bool flag_stop_search; + if (bare_bone_search) { + flag_stop_search = candidate_dist > lowerBound; + } else { + if (stop_condition) { + flag_stop_search = stop_condition->should_stop_search(candidate_dist, lowerBound); + } else { + flag_stop_search = candidate_dist > lowerBound && top_candidates.size() == ef; + } + } + if (flag_stop_search) { break; } candidate_set.pop(); @@ -353,7 +388,14 @@ class HierarchicalNSW : public AlgorithmInterface { char *currObj1 = (getDataByInternalId(candidate_id)); dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); - if (top_candidates.size() < ef || lowerBound > dist) { + bool flag_consider_candidate; + if (!bare_bone_search && stop_condition) { + flag_consider_candidate = stop_condition->should_consider_candidate(dist, lowerBound); + } else { + flag_consider_candidate = top_candidates.size() < ef || lowerBound > dist; + } + + if (flag_consider_candidate) { candidate_set.emplace(-dist, candidate_id); #ifdef USE_SSE _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ + @@ -361,11 +403,30 @@ class HierarchicalNSW : public AlgorithmInterface { _MM_HINT_T0); //////////////////////// #endif - if ((!has_deletions || 
!isMarkedDeleted(candidate_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))
+                        if (bare_bone_search ||
+                            (!isMarkedDeleted(candidate_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) {
                             top_candidates.emplace(dist, candidate_id);
+                            if (!bare_bone_search && stop_condition) {
+                                stop_condition->add_point_to_result(getExternalLabel(candidate_id), currObj1, dist);
+                            }
+                        }

-                        if (top_candidates.size() > ef)
+                        bool flag_remove_extra = false;
+                        if (!bare_bone_search && stop_condition) {
+                            flag_remove_extra = stop_condition->should_remove_extra();
+                        } else {
+                            flag_remove_extra = top_candidates.size() > ef;
+                        }
+                        while (flag_remove_extra) {
+                            const std::pair<dist_t, tableint> removed = top_candidates.top();  // evicted (dist, id); `dist` below is the new candidate's
                             top_candidates.pop();
+                            if (!bare_bone_search && stop_condition) {
+                                stop_condition->remove_point_from_result(getExternalLabel(removed.second), getDataByInternalId(removed.second), removed.first);
+                                flag_remove_extra = stop_condition->should_remove_extra();
+                            } else {
+                                flag_remove_extra = top_candidates.size() > ef;
+                            }
+                        }

                         if (!top_candidates.empty())
                             lowerBound = top_candidates.top().first;
@@ -380,8 +441,8 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {


     void getNeighborsByHeuristic2(
-            std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
-    const size_t M) {
+        std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
+        const size_t M) {
         if (top_candidates.size() < M) {
             return;
         }
@@ -573,8 +634,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
         if (new_max_elements < cur_element_count)
             throw std::runtime_error("Cannot resize, max element is less than the current number of elements");

-        delete visited_list_pool_;
-        visited_list_pool_ = new VisitedListPool(1, new_max_elements);
+        visited_list_pool_.reset(new VisitedListPool(1, new_max_elements));

         element_levels_.resize(new_max_elements);

@@ -595,6 +655,32 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {

         max_elements_ = new_max_elements;
     }
+    size_t indexFileSize() const {
+        size_t
size = 0; + size += sizeof(offsetLevel0_); + size += sizeof(max_elements_); + size += sizeof(cur_element_count); + size += sizeof(size_data_per_element_); + size += sizeof(label_offset_); + size += sizeof(offsetData_); + size += sizeof(maxlevel_); + size += sizeof(enterpoint_node_); + size += sizeof(maxM_); + + size += sizeof(maxM0_); + size += sizeof(M_); + size += sizeof(mult_); + size += sizeof(ef_construction_); + + size += cur_element_count * size_data_per_element_; + + for (size_t i = 0; i < cur_element_count; i++) { + unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0; + size += sizeof(linkListSize); + size += linkListSize; + } + return size; + } void saveIndex(const std::string &location) { std::ofstream output(location, std::ios::binary); @@ -633,6 +719,7 @@ class HierarchicalNSW : public AlgorithmInterface { if (!input.is_open()) throw std::runtime_error("Cannot open file"); + clear(); // get file size: input.seekg(0, input.end); std::streampos total_filesize = input.tellg(); @@ -698,7 +785,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::vector(max_elements).swap(link_list_locks_); std::vector(MAX_LABEL_OPERATION_LOCKS).swap(label_op_locks_); - visited_list_pool_ = new VisitedListPool(1, max_elements); + visited_list_pool_.reset(new VisitedListPool(1, max_elements)); linkLists_ = (char **) malloc(sizeof(void *) * max_elements); if (linkLists_ == nullptr) @@ -752,7 +839,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t dim = *((size_t *) dist_func_param_); std::vector data; data_t* data_ptr = (data_t*) data_ptrv; - for (int i = 0; i < dim; i++) { + for (size_t i = 0; i < dim; i++) { data.push_back(*data_ptr); data_ptr += 1; } @@ -1216,11 +1303,12 @@ class HierarchicalNSW : public AlgorithmInterface { } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (num_deleted_) { - top_candidates = searchBaseLayerST( + bool bare_bone_search = !num_deleted_ && 
!isIdAllowed; + if (bare_bone_search) { + top_candidates = searchBaseLayerST( currObj, query_data, std::max(ef_, k), isIdAllowed); } else { - top_candidates = searchBaseLayerST( + top_candidates = searchBaseLayerST( currObj, query_data, std::max(ef_, k), isIdAllowed); } @@ -1236,6 +1324,60 @@ class HierarchicalNSW : public AlgorithmInterface { } + std::vector> + searchStopConditionClosest( + const void *query_data, + BaseSearchStopCondition& stop_condition, + BaseFilterFunctor* isIdAllowed = nullptr) const { + std::vector> result; + if (cur_element_count == 0) return result; + + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + + for (int level = maxlevel_; level > 0; level--) { + bool changed = true; + while (changed) { + changed = false; + unsigned int *data; + + data = (unsigned int *) get_linklist(currObj, level); + int size = getListCount(data); + metric_hops++; + metric_distance_computations+=size; + + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; + if (cand < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + + if (d < curdist) { + curdist = d; + currObj = cand; + changed = true; + } + } + } + } + + std::priority_queue, std::vector>, CompareByFirst> top_candidates; + top_candidates = searchBaseLayerST(currObj, query_data, 0, isIdAllowed, &stop_condition); + + size_t sz = top_candidates.size(); + result.resize(sz); + while (!top_candidates.empty()) { + result[--sz] = top_candidates.top(); + top_candidates.pop(); + } + + stop_condition.filter_results(result); + + return result; + } + + void checkIntegrity() { int connections_checked = 0; std::vector inbound_connections_num(cur_element_count, 0); @@ -1246,7 +1388,6 @@ class HierarchicalNSW : public AlgorithmInterface { tableint *data = (tableint *) (ll_cur + 
1); std::unordered_set s; for (int j = 0; j < size; j++) { - assert(data[j] > 0); assert(data[j] < cur_element_count); assert(data[j] != i); inbound_connections_num[data[j]]++; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index fb7118fa..7ccfbba5 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -1,4 +1,13 @@ #pragma once + +// https://github.com/nmslib/hnswlib/pull/508 +// This allows others to provide their own error stream (e.g. RcppHNSW) +#ifndef HNSWLIB_ERR_OVERRIDE + #define HNSWERR std::cerr +#else + #define HNSWERR HNSWLIB_ERR_OVERRIDE +#endif + #ifndef NO_MANUAL_VECTORIZATION #if (defined(__SSE__) || _M_IX86_FP > 0 || defined(_M_AMD64) || defined(_M_X64)) #define USE_SSE @@ -15,7 +24,7 @@ #ifdef _MSC_VER #include #include -void cpuid(int32_t out[4], int32_t eax, int32_t ecx) { +static void cpuid(int32_t out[4], int32_t eax, int32_t ecx) { __cpuidex(out, eax, ecx); } static __int64 xgetbv(unsigned int x) { @@ -119,6 +128,25 @@ typedef size_t labeltype; class BaseFilterFunctor { public: virtual bool operator()(hnswlib::labeltype id) { return true; } + virtual ~BaseFilterFunctor() {}; +}; + +template +class BaseSearchStopCondition { + public: + virtual void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) = 0; + + virtual void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) = 0; + + virtual bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) = 0; + + virtual bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) = 0; + + virtual bool should_remove_extra() = 0; + + virtual void filter_results(std::vector> &candidates) = 0; + + virtual ~BaseSearchStopCondition() {} }; template @@ -195,5 +223,6 @@ AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t #include "space_l2.h" #include "space_ip.h" +#include "stop_condition.h" #include "bruteforce.h" #include "hnswalg.h" diff --git 
a/hnswlib/space_ip.h b/hnswlib/space_ip.h index 2b1c359e..0e6834c1 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void __m512 sum512 = _mm512_set1_ps(0); - while (pVect1 < pEnd1) { - //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0); - + size_t loop = qty16 / 4; + + while (loop--) { __m512 v1 = _mm512_loadu_ps(pVect1); - pVect1 += 16; __m512 v2 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + __m512 v3 = _mm512_loadu_ps(pVect1); + __m512 v4 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + __m512 v5 = _mm512_loadu_ps(pVect1); + __m512 v6 = _mm512_loadu_ps(pVect2); + pVect1 += 16; pVect2 += 16; - sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2)); + + __m512 v7 = _mm512_loadu_ps(pVect1); + __m512 v8 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + + sum512 = _mm512_fmadd_ps(v1, v2, sum512); + sum512 = _mm512_fmadd_ps(v3, v4, sum512); + sum512 = _mm512_fmadd_ps(v5, v6, sum512); + sum512 = _mm512_fmadd_ps(v7, v8, sum512); } - _mm512_store_ps(TmpRes, sum512); - float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15]; + while (pVect1 < pEnd1) { + __m512 v1 = _mm512_loadu_ps(pVect1); + __m512 v2 = _mm512_loadu_ps(pVect2); + pVect1 += 16; + pVect2 += 16; + sum512 = _mm512_fmadd_ps(v1, v2, sum512); + } + float sum = _mm512_reduce_add_ps(sum512); return sum; } diff --git a/hnswlib/stop_condition.h b/hnswlib/stop_condition.h new file mode 100644 index 00000000..acc80ebe --- /dev/null +++ b/hnswlib/stop_condition.h @@ -0,0 +1,276 @@ +#pragma once +#include "space_l2.h" +#include "space_ip.h" +#include +#include + +namespace hnswlib { + +template +class BaseMultiVectorSpace : public SpaceInterface { + public: + virtual DOCIDTYPE get_doc_id(const void *datapoint) = 0; + + virtual 
void set_doc_id(void *datapoint, DOCIDTYPE doc_id) = 0; +}; + + +template +class MultiVectorL2Space : public BaseMultiVectorSpace { + DISTFUNC fstdistfunc_; + size_t data_size_; + size_t vector_size_; + size_t dim_; + + public: + MultiVectorL2Space(size_t dim) { + fstdistfunc_ = L2Sqr; +#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512) + #if defined(USE_AVX512) + if (AVX512Capable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX512; + else if (AVXCapable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX; + #elif defined(USE_AVX) + if (AVXCapable()) + L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX; + #endif + + if (dim % 16 == 0) + fstdistfunc_ = L2SqrSIMD16Ext; + else if (dim % 4 == 0) + fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; +#endif + dim_ = dim; + vector_size_ = dim * sizeof(float); + data_size_ = vector_size_ + sizeof(DOCIDTYPE); + } + + size_t get_data_size() override { + return data_size_; + } + + DISTFUNC get_dist_func() override { + return fstdistfunc_; + } + + void *get_dist_func_param() override { + return &dim_; + } + + DOCIDTYPE get_doc_id(const void *datapoint) override { + return *(DOCIDTYPE *)((char *)datapoint + vector_size_); + } + + void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override { + *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id; + } + + ~MultiVectorL2Space() {} +}; + + +template +class MultiVectorInnerProductSpace : public BaseMultiVectorSpace { + DISTFUNC fstdistfunc_; + size_t data_size_; + size_t vector_size_; + size_t dim_; + + public: + MultiVectorInnerProductSpace(size_t dim) { + fstdistfunc_ = InnerProductDistance; +#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512) + #if defined(USE_AVX512) + if (AVX512Capable()) { + InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512; + } else if (AVXCapable()) { + InnerProductSIMD16Ext = 
InnerProductSIMD16ExtAVX; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX; + } + #elif defined(USE_AVX) + if (AVXCapable()) { + InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX; + InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX; + } + #endif + #if defined(USE_AVX) + if (AVXCapable()) { + InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX; + InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX; + } + #endif + + if (dim % 16 == 0) + fstdistfunc_ = InnerProductDistanceSIMD16Ext; + else if (dim % 4 == 0) + fstdistfunc_ = InnerProductDistanceSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals; +#endif + vector_size_ = dim * sizeof(float); + data_size_ = vector_size_ + sizeof(DOCIDTYPE); + } + + size_t get_data_size() override { + return data_size_; + } + + DISTFUNC get_dist_func() override { + return fstdistfunc_; + } + + void *get_dist_func_param() override { + return &dim_; + } + + DOCIDTYPE get_doc_id(const void *datapoint) override { + return *(DOCIDTYPE *)((char *)datapoint + vector_size_); + } + + void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override { + *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id; + } + + ~MultiVectorInnerProductSpace() {} +}; + + +template +class MultiVectorSearchStopCondition : public BaseSearchStopCondition { + size_t curr_num_docs_; + size_t num_docs_to_search_; + size_t ef_collection_; + std::unordered_map doc_counter_; + std::priority_queue> search_results_; + BaseMultiVectorSpace& space_; + + public: + MultiVectorSearchStopCondition( + BaseMultiVectorSpace& space, + size_t num_docs_to_search, + size_t ef_collection = 10) + : space_(space) { + curr_num_docs_ = 0; + num_docs_to_search_ = num_docs_to_search; + ef_collection_ = std::max(ef_collection, num_docs_to_search); + } + + void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override { + 
DOCIDTYPE doc_id = space_.get_doc_id(datapoint); + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ += 1; + } + search_results_.emplace(dist, doc_id); + doc_counter_[doc_id] += 1; + } + + void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override { + DOCIDTYPE doc_id = space_.get_doc_id(datapoint); + doc_counter_[doc_id] -= 1; + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ -= 1; + } + search_results_.pop(); + } + + bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override { + bool stop_search = candidate_dist > lowerBound && curr_num_docs_ == ef_collection_; + return stop_search; + } + + bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override { + bool flag_consider_candidate = curr_num_docs_ < ef_collection_ || lowerBound > candidate_dist; + return flag_consider_candidate; + } + + bool should_remove_extra() override { + bool flag_remove_extra = curr_num_docs_ > ef_collection_; + return flag_remove_extra; + } + + void filter_results(std::vector> &candidates) override { + while (curr_num_docs_ > num_docs_to_search_) { + dist_t dist_cand = candidates.back().first; + dist_t dist_res = search_results_.top().first; + assert(dist_cand == dist_res); + DOCIDTYPE doc_id = search_results_.top().second; + doc_counter_[doc_id] -= 1; + if (doc_counter_[doc_id] == 0) { + curr_num_docs_ -= 1; + } + search_results_.pop(); + candidates.pop_back(); + } + } + + ~MultiVectorSearchStopCondition() {} +}; + + +template +class EpsilonSearchStopCondition : public BaseSearchStopCondition { + float epsilon_; + size_t min_num_candidates_; + size_t max_num_candidates_; + size_t curr_num_items_; + + public: + EpsilonSearchStopCondition(float epsilon, size_t min_num_candidates, size_t max_num_candidates) { + assert(min_num_candidates <= max_num_candidates); + epsilon_ = epsilon; + min_num_candidates_ = min_num_candidates; + max_num_candidates_ = max_num_candidates; + curr_num_items_ = 0; + } + + void 
add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override { + curr_num_items_ += 1; + } + + void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override { + curr_num_items_ -= 1; + } + + bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override { + if (candidate_dist > lowerBound && curr_num_items_ == max_num_candidates_) { + // new candidate can't improve found results + return true; + } + if (candidate_dist > epsilon_ && curr_num_items_ >= min_num_candidates_) { + // new candidate is out of epsilon region and + // minimum number of candidates is checked + return true; + } + return false; + } + + bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override { + bool flag_consider_candidate = curr_num_items_ < max_num_candidates_ || lowerBound > candidate_dist; + return flag_consider_candidate; + } + + bool should_remove_extra() { + bool flag_remove_extra = curr_num_items_ > max_num_candidates_; + return flag_remove_extra; + } + + void filter_results(std::vector> &candidates) override { + while (!candidates.empty() && candidates.back().first > epsilon_) { + candidates.pop_back(); + } + while (candidates.size() > max_num_candidates_) { + candidates.pop_back(); + } + } + + ~EpsilonSearchStopCondition() {} +}; +} // namespace hnswlib diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 5153bb58..dd09e80a 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -218,6 +218,9 @@ class Index { this->num_threads_default = num_threads; } + size_t indexFileSize() const { + return appr_alg->indexFileSize(); + } void saveIndex(const std::string &path_to_index) { appr_alg->saveIndex(path_to_index); @@ -301,7 +304,11 @@ class Index { } - std::vector> getDataReturnList(py::object ids_ = py::none()) { + py::object getData(py::object ids_ = py::none(), std::string return_type = "numpy") { + std::vector return_types{"numpy", "list"}; + if 
(std::find(std::begin(return_types), std::end(return_types), return_type) == std::end(return_types)) { + throw std::invalid_argument("return_type should be \"numpy\" or \"list\""); + } std::vector ids; if (!ids_.is_none()) { py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_); @@ -322,7 +329,12 @@ class Index { for (auto id : ids) { data.push_back(appr_alg->template getDataByLabel(id)); } - return data; + if (return_type == "list") { + return py::cast(data); + } + if (return_type == "numpy") { + return py::array_t< data_t, py::array::c_style | py::array::forcecast >(py::cast(data)); + } } @@ -633,7 +645,7 @@ class Index { (void*)items.data(row), k, p_idFilter); if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contiguous 2D array. Probably ef or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -653,7 +665,7 @@ class Index { (void*)(norm_array.data() + start_idx), k, p_idFilter); if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contiguous 2D array. 
Probably ef or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -719,6 +731,7 @@ class BFIndex { int dim; bool index_inited; bool normalize; + int num_threads_default; hnswlib::labeltype cur_l; hnswlib::BruteforceSearch* alg; @@ -739,6 +752,8 @@ class BFIndex { } alg = NULL; index_inited = false; + + num_threads_default = std::thread::hardware_concurrency(); } @@ -749,6 +764,21 @@ class BFIndex { } + size_t getMaxElements() const { + return alg->maxelements_; + } + + + size_t getCurrentCount() const { + return alg->cur_element_count; + } + + + void set_num_threads(int num_threads) { + this->num_threads_default = num_threads; + } + + void init_new_index(const size_t maxElements) { if (alg) { throw std::runtime_error("The index is already initiated."); @@ -820,15 +850,19 @@ class BFIndex { py::object knnQuery_return_numpy( py::object input, size_t k = 1, + int num_threads = -1, const std::function& filter = nullptr) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype *data_numpy_l; dist_t *data_numpy_d; size_t rows, features; + + if (num_threads <= 0) + num_threads = num_threads_default; + { py::gil_scoped_release l; - get_input_array_shapes(buffer, &rows, &features); data_numpy_l = new hnswlib::labeltype[rows * k]; @@ -837,16 +871,16 @@ class BFIndex { CustomFilterFunctor idFilter(filter); CustomFilterFunctor* p_idFilter = filter ? 
&idFilter : nullptr; - for (size_t row = 0; row < rows; row++) { + ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) { std::priority_queue> result = alg->searchKnn( - (void *) items.data(row), k, p_idFilter); + (void*)items.data(row), k, p_idFilter); for (int i = k - 1; i >= 0; i--) { - auto &result_tuple = result.top(); + auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; data_numpy_l[row * k + i] = result_tuple.second; result.pop(); } - } + }); } py::capsule free_when_done_l(data_numpy_l, [](void *f) { @@ -900,10 +934,11 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("ids") = py::none(), py::arg("num_threads") = -1, py::arg("replace_deleted") = false) - .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) + .def("get_items", &Index::getData, py::arg("ids") = py::none(), py::arg("return_type") = "numpy") .def("get_ids_list", &Index::getIdsList) .def("set_ef", &Index::set_ef, py::arg("ef")) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) + .def("index_file_size", &Index::indexFileSize) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", &Index::loadIndex, @@ -957,13 +992,22 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "BFIndex") .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) - .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none()) + .def("knn_query", + &BFIndex::knnQuery_return_numpy, + py::arg("data"), + py::arg("k") = 1, + py::arg("num_threads") = -1, + py::arg("filter") = py::none()) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) + .def("set_num_threads", &BFIndex::set_num_threads, py::arg("num_threads")) .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) .def("load_index", &BFIndex::loadIndex, 
py::arg("path_to_index"), py::arg("max_elements") = 0) .def("__repr__", [](const BFIndex &a) { return ""; - }); + }) + .def("get_max_elements", &BFIndex::getMaxElements) + .def("get_current_count", &BFIndex::getCurrentCount) + .def_readwrite("num_threads", &BFIndex::num_threads_default); return m.ptr(); } diff --git a/python_bindings/tests/bindings_test_bf_index.py b/python_bindings/tests/bindings_test_bf_index.py new file mode 100644 index 00000000..060b9943 --- /dev/null +++ b/python_bindings/tests/bindings_test_bf_index.py @@ -0,0 +1,49 @@ +import unittest + +import numpy as np + +import hnswlib + + +class RandomSelfTestCase(unittest.TestCase): + def testBFIndex(self): + + dim = 16 + num_elements = 10000 + num_queries = 1000 + k = 20 + + # Generating sample data + data = np.float32(np.random.random((num_elements, dim))) + + # Declaring index + bf_index = hnswlib.BFIndex(space='l2', dim=dim) # possible options are l2, cosine or ip + bf_index.init_index(max_elements=num_elements) + + num_threads = 8 + bf_index.set_num_threads(num_threads) # by default using all available cores + + print(f"Adding all elements {num_elements}") + bf_index.add_items(data) + + self.assertEqual(bf_index.num_threads, num_threads) + self.assertEqual(bf_index.get_max_elements(), num_elements) + self.assertEqual(bf_index.get_current_count(), num_elements) + + queries = np.float32(np.random.random((num_queries, dim))) + print("Searching nearest neighbours") + labels, distances = bf_index.knn_query(queries, k=k) + + print("Checking results") + for i in range(num_queries): + query = queries[i] + sq_dists = (data - query)**2 + dists = np.sum(sq_dists, axis=1) + labels_gt = np.argsort(dists)[:k] + dists_gt = dists[labels_gt] + dists_bf = distances[i] + # we can compare labels but because of numeric errors in distance calculation in C++ and numpy + # sometimes we get different order of labels, therefore we compare distances + max_diff_with_gt = np.max(np.abs(dists_gt - dists_bf)) + + 
self.assertTrue(max_diff_with_gt < 1e-5) diff --git a/setup.py b/setup.py index 0126585e..d96aea49 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = '0.7.0' +__version__ = '0.8.0' include_dirs = [ @@ -73,22 +73,20 @@ def cpp_flag(compiler): class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" + compiler_flag_native = '-march=native' c_opts = { 'msvc': ['/EHsc', '/openmp', '/O2'], - #'unix': ['-O3', '-march=native'], # , '-w' - 'unix': ['-O3'], # , '-w' + 'unix': ['-O3', compiler_flag_native], # , '-w' } - if not os.environ.get("HNSWLIB_NO_NATIVE"): - c_opts['unix'].append('-march=native') - link_opts = { 'unix': [], 'msvc': [], } + if os.environ.get("HNSWLIB_NO_NATIVE"): + c_opts['unix'].remove(compiler_flag_native) + if sys.platform == 'darwin': - if platform.machine() == 'arm64': - c_opts['unix'].remove('-march=native') c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] else: @@ -97,18 +95,35 @@ class BuildExt(build_ext): def build_extensions(self): ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) + opts = BuildExt.c_opts.get(ct, []) if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) opts.append(cpp_flag(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') + if not os.environ.get("HNSWLIB_NO_NATIVE"): + # check that native flag is available + print('checking avalability of flag:', BuildExt.compiler_flag_native) + if not has_flag(self.compiler, BuildExt.compiler_flag_native): + print('removing unsupported compiler flag:', BuildExt.compiler_flag_native) + opts.remove(BuildExt.compiler_flag_native) + # for macos add apple-m1 flag if it's available + if sys.platform == 'darwin': + m1_flag = '-mcpu=apple-m1' + print('checking avalability 
of flag:', m1_flag) + if has_flag(self.compiler, m1_flag): + print('adding flag:', m1_flag) + opts.append(m1_flag) + else: + print(f'flag: {m1_flag} is not available') + else: + print(f'flag: {BuildExt.compiler_flag_native} is available') elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) for ext in self.extensions: ext.extra_compile_args.extend(opts) - ext.extra_link_args.extend(self.link_opts.get(ct, [])) + ext.extra_link_args.extend(BuildExt.link_opts.get(ct, [])) build_ext.build_extensions(self) diff --git a/tests/cpp/epsilon_search_test.cpp b/tests/cpp/epsilon_search_test.cpp new file mode 100644 index 00000000..38df6246 --- /dev/null +++ b/tests/cpp/epsilon_search_test.cpp @@ -0,0 +1,114 @@ +#include "assert.h" +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 100; + float epsilon2 = 1.0; // Squared distance to query + int max_num_candidates = max_elements; // Upper bound on the number of returned elements in the epsilon region + int min_num_candidates = 2000; // Minimum number of candidates to search in the epsilon region + // this parameter is similar to ef + + // Initing index + hnswlib::L2Space space(dim); + hnswlib::BruteforceSearch* alg_brute = new hnswlib::BruteforceSearch(&space, max_elements); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + + float* data = new float[dim * max_elements]; + for (int i = 0; i < dim * max_elements; 
i++) { + data[i] = distrib_real(rng); + } + + // Add data to index + std::cout << "Building index ...\n"; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + float* point_data = data + i * dim; + alg_hnsw->addPoint(point_data, label); + alg_brute->addPoint(point_data, label); + } + std::cout << "Index is ready\n"; + + // Query random vectors + for (int i = 0; i < num_queries; i++) { + float* query_data = new float[dim]; + for (int j = 0; j < dim; j++) { + query_data[j] = distrib_real(rng); + } + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2, min_num_candidates, max_num_candidates); + std::vector> result_hnsw = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + + // check that returned results are in epsilon region + size_t num_vectors = result_hnsw.size(); + std::unordered_set hnsw_labels; + for (auto pair: result_hnsw) { + float dist = pair.first; + hnswlib::labeltype label = pair.second; + hnsw_labels.insert(label); + assert(dist >=0 && dist <= epsilon2); + } + std::priority_queue> result_brute = + alg_brute->searchKnn(query_data, max_elements); + + // check recall + std::unordered_set gt_labels; + while (!result_brute.empty()) { + float dist = result_brute.top().first; + hnswlib::labeltype label = result_brute.top().second; + if (dist < epsilon2) { + gt_labels.insert(label); + } + result_brute.pop(); + } + float correct = 0; + for (const auto& hnsw_label: hnsw_labels) { + if (gt_labels.find(hnsw_label) != gt_labels.end()) { + correct += 1; + } + } + if (gt_labels.size() == 0) { + assert(correct == 0); + continue; + } + float recall = correct / gt_labels.size(); + assert(recall > 0.95); + delete[] query_data; + } + std::cout << "Recall is OK\n"; + + // Query the elements for themselves and check that query can be found + float epsilon2_small = 0.0001f; + int min_candidates_small = 500; + for (size_t i = 0; i < max_elements; i++) { + hnswlib::EpsilonSearchStopCondition stop_condition(epsilon2_small, 
min_candidates_small, max_num_candidates); + std::vector> result = + alg_hnsw->searchStopConditionClosest(alg_hnsw->getDataByInternalId(i), stop_condition); + size_t num_vectors = result.size(); + // get closest distance + float dist = -1; + if (!result.empty()) { + dist = result[0].first; + } + assert(dist == 0); + } + std::cout << "Small epsilon search is OK\n"; + + delete[] data; + delete alg_brute; + delete alg_hnsw; + return 0; +} diff --git a/tests/cpp/multivector_search_test.cpp b/tests/cpp/multivector_search_test.cpp new file mode 100644 index 00000000..be783176 --- /dev/null +++ b/tests/cpp/multivector_search_test.cpp @@ -0,0 +1,126 @@ +#include +#include "../../hnswlib/hnswlib.h" + +typedef unsigned int docidtype; +typedef float dist_t; + +int main() { + int dim = 16; // Dimension of the elements + int max_elements = 1000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + + int num_queries = 100; + int num_docs = 10; // Number of documents to search + int ef_collection = 15; // Number of candidate documents during the search + // Controlls the recall: higher ef leads to better accuracy, but slower search + docidtype min_doc_id = 0; + docidtype max_doc_id = 49; + + // Initing index + hnswlib::MultiVectorL2Space space(dim); + hnswlib::BruteforceSearch* alg_brute = new hnswlib::BruteforceSearch(&space, max_elements); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction); + + // Generate random data + std::mt19937 rng; + rng.seed(47); + std::uniform_real_distribution<> distrib_real; + std::uniform_int_distribution distrib_docid(min_doc_id, max_doc_id); + + size_t data_point_size = space.get_data_size(); + char* data = new char[data_point_size * max_elements]; + for (int i = 0; i < max_elements; 
i++) { + // set vector value + char* point_data = data + i * data_point_size; + for (int j = 0; j < dim; j++) { + char* vec_data = point_data + j * sizeof(float); + float value = distrib_real(rng); + *(float*)vec_data = value; + } + // set document id + docidtype doc_id = distrib_docid(rng); + space.set_doc_id(point_data, doc_id); + } + + // Add data to index + std::unordered_map label_docid_lookup; + for (int i = 0; i < max_elements; i++) { + hnswlib::labeltype label = i; + char* point_data = data + i * data_point_size; + alg_hnsw->addPoint(point_data, label); + alg_brute->addPoint(point_data, label); + label_docid_lookup[label] = space.get_doc_id(point_data); + } + + // Query random vectors and check overall recall + float correct = 0; + float total_num_elements = 0; + size_t query_size = dim * sizeof(float); + for (int i = 0; i < num_queries; i++) { + char* query_data = new char[query_size]; + for (int j = 0; j < dim; j++) { + size_t offset = j * sizeof(float); + char* vec_data = query_data + offset; + float value = distrib_real(rng); + *(float*)vec_data = value; + } + hnswlib::MultiVectorSearchStopCondition stop_condition(space, num_docs, ef_collection); + std::vector> hnsw_results = + alg_hnsw->searchStopConditionClosest(query_data, stop_condition); + + // check number of found documents + std::unordered_set hnsw_docs; + std::unordered_set hnsw_labels; + for (auto pair: hnsw_results) { + hnswlib::labeltype label = pair.second; + hnsw_labels.emplace(label); + docidtype doc_id = label_docid_lookup[label]; + hnsw_docs.emplace(doc_id); + } + assert(hnsw_docs.size() == num_docs); + + // Check overall recall + std::vector> gt_results = + alg_brute->searchKnnCloserFirst(query_data, max_elements); + std::unordered_set gt_docs; + for (int i = 0; i < gt_results.size(); i++) { + if (gt_docs.size() == num_docs) { + break; + } + hnswlib::labeltype gt_label = gt_results[i].second; + if (hnsw_labels.find(gt_label) != hnsw_labels.end()) { + correct += 1; + } + docidtype 
gt_doc_id = label_docid_lookup[gt_label]; + gt_docs.emplace(gt_doc_id); + total_num_elements += 1; + } + delete[] query_data; + } + float recall = correct / total_num_elements; + std::cout << "random elements search recall : " << recall << "\n"; + assert(recall > 0.95); + + // Query the elements for themselves and measure recall + correct = 0; + for (int i = 0; i < max_elements; i++) { + hnswlib::MultiVectorSearchStopCondition stop_condition(space, num_docs, ef_collection); + std::vector> result = + alg_hnsw->searchStopConditionClosest(data + i * data_point_size, stop_condition); + hnswlib::labeltype label = -1; + if (!result.empty()) { + label = result[0].second; + } + if (label == i) correct++; + } + recall = correct / max_elements; + std::cout << "same elements search recall : " << recall << "\n"; + assert(recall > 0.99); + + delete[] data; + delete alg_brute; + delete alg_hnsw; + return 0; +} diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 43777ff6..c0f296c2 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -250,11 +250,11 @@ void sift_test1B() { size_t vecdim = 128; char path_index[1024]; char path_gt[1024]; - char *path_q = "../bigann/bigann_query.bvecs"; - char *path_data = "../bigann/bigann_base.bvecs"; - sprintf(path_index, "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); + const char *path_q = "../bigann/bigann_query.bvecs"; + const char *path_data = "../bigann/bigann_base.bvecs"; + snprintf(path_index, sizeof(path_index), "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M); - sprintf(path_gt, "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); + snprintf(path_gt, sizeof(path_gt), "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions); unsigned char *massb = new unsigned char[vecdim]; diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index 52e1fa14..4dff2f85 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -239,7 +239,7 @@ int 
main(int argc, char **argv) { for (int b = 1; b < dummy_data_multiplier; b++) { std::cout << "Update iteration " << b << "\n"; char cpath[1024]; - sprintf(cpath, "batch_dummy_%02d.bin", b); + snprintf(cpath, sizeof(cpath), "batch_dummy_%02d.bin", b); std::vector dummy_batchb = load_batch(path + cpath, N * d); ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { diff --git a/tests/python/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py index 515ecebd..3e16f9b9 100644 --- a/tests/python/bindings_test_getdata.py +++ b/tests/python/bindings_test_getdata.py @@ -45,5 +45,11 @@ def testGettingItems(self): self.assertRaises(ValueError, lambda: p.get_items(labels[0])) # After adding them, all labels should be retrievable - returned_items = p.get_items(labels) - self.assertSequenceEqual(data.tolist(), returned_items) + returned_items_np = p.get_items(labels) + self.assertTrue((data == returned_items_np).all()) + + # check returned type of get_items + self.assertTrue(isinstance(returned_items_np, np.ndarray)) + returned_items_list = p.get_items(labels, return_type="list") + self.assertTrue(isinstance(returned_items_list, list)) + self.assertTrue(isinstance(returned_items_list[0], list)) diff --git a/tests/python/bindings_test_replace.py b/tests/python/bindings_test_replace.py index 80003a3a..09c1299e 100644 --- a/tests/python/bindings_test_replace.py +++ b/tests/python/bindings_test_replace.py @@ -94,10 +94,10 @@ def testRandomSelf(self): remaining_data = comb_data[remaining_labels_list] returned_items = hnsw_index.get_items(remaining_labels_list) - self.assertSequenceEqual(remaining_data.tolist(), returned_items) + self.assertTrue((remaining_data == returned_items).all()) returned_items = hnsw_index.get_items(labels3_tr) - self.assertSequenceEqual(data3_tr.tolist(), returned_items) + self.assertTrue((data3_tr == returned_items).all()) # Check index serialization # Delete batch 3 diff --git a/tests/python/draw_git_test_plots.py 
b/tests/python/draw_git_test_plots.py new file mode 100644 index 00000000..c91c8f5d --- /dev/null +++ b/tests/python/draw_git_test_plots.py @@ -0,0 +1,48 @@ +import os +import glob +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +def plot_data_from_file(file_path): + # Load the data, assuming the last column is text + data = pd.read_csv(file_path, header=None) + rep_size=len(set(data[data.columns[-1]])) + data.drop(data.columns[-1], axis=1, inplace=True) # Drop the last column (text) + + # Number of numerical columns + num_columns = data.shape[1] + + # Create a subplot for each column + fig, axes = plt.subplots(num_columns, 1, figsize=(10, 6 * num_columns)) + + # In case there is only one column, axes will not be an array, so we convert it + if num_columns == 1: + axes = [axes] + + for i, ax in enumerate(axes): + idx=0 + ax.scatter(np.asarray(data.index,dtype=np.int64)%rep_size, data[i], label=f'Column {i+1}') + ax.set_title(f'Column {i+1}') + ax.set_xlabel('ID Number') + ax.set_ylabel('Value') + ax.legend() + ax.grid(True) + + plt.tight_layout() + plt.suptitle(f'Data from {os.path.basename(file_path)}') + + # Save the plot to a file + plt.savefig(file_path.replace('.txt', '.png')) + plt.close() + +def scan_and_plot(directory): + # Scan for .txt files in the given directory + txt_files = glob.glob(os.path.join(directory, '*.txt')) + + # Process each file + for file in txt_files: + print(f'Processing {file}...') + plot_data_from_file(file) + print(f'Plot saved for {file}') +# Replace 'your_folder_path' with the path to the folder containing the .txt files +scan_and_plot('./') \ No newline at end of file diff --git a/tests/python/git_tester.py b/tests/python/git_tester.py index 1f9c2ba7..e7657fee 100644 --- a/tests/python/git_tester.py +++ b/tests/python/git_tester.py @@ -9,16 +9,18 @@ speedtest_copy_path = os.path.join("tests", "python", "speedtest2.py") shutil.copyfile(speedtest_src_path, speedtest_copy_path) # the file has to be outside 
of git -commits = list(Repository('.', from_tag="v0.6.2").traverse_commits()) +commits = list(Repository('.', from_tag="v0.7.0").traverse_commits()) print("Found commits:") for idx, commit in enumerate(commits): name = commit.msg.replace('\n', ' ').replace('\r', ' ') print(idx, commit.hash, name) for commit in commits: - name = commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";") + commit_time = commit.author_date.strftime("%Y-%m-%d %H:%M:%S") + author_name = commit.author.name + name = "auth:"+author_name+"_"+commit_time+"_msg:"+commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";") print("\nProcessing", commit.hash, name) - + if os.path.exists("build"): shutil.rmtree("build") os.system(f"git checkout {commit.hash}") @@ -43,10 +45,11 @@ print("build failed!!!!") continue - # os.system(f'python {speedtest_copy_path} -n "{hash[:4]}_{name}" -d 32 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 1') os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 64') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 64 -t 1') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 1') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 4 -t 24') - # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 24') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 64') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 1') + os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 64') +