Skip to content

Commit

Permalink
Add simplified functions for product quantization (#514)
Browse files Browse the repository at this point in the history
* Add simplified functions for product quantization

* Fixing formatting errors

* Fixing clang-format issue

* Fixing another set of clang-format issues

---------

Co-authored-by: Michael Popov (from Dev Box) <[email protected]>
  • Loading branch information
michael-popov and Michael Popov (from Dev Box) authored Feb 27, 2024
1 parent 340bc58 commit a25ee6f
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 0 deletions.
8 changes: 8 additions & 0 deletions include/pq.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,19 @@ DISKANN_DLLEXPORT int generate_opq_pivots(const float *train_data, size_t num_tr
unsigned num_pq_chunks, std::string opq_pivots_path,
bool make_zero_mean = false);

// Trains product-quantization pivots from an in-memory training set (simplified
// counterpart of generate_pq_pivots that performs no file I/O).
//   train_data        - num_train vectors of dim floats each, stored row-major.
//   num_train         - number of training vectors.
//   dim               - dimensionality of each vector.
//   num_pq_chunks     - number of PQ chunks; must evenly divide dim.
//   pivot_data_vector - output; resized to 256 * dim floats, one row of dim
//                       floats per center, with each chunk's centroid stored
//                       at that chunk's column offset.
// Returns 0 on success and -1 when the arguments are rejected (for example when
// num_pq_chunks exceeds dim or does not evenly divide it).
DISKANN_DLLEXPORT int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim,
size_t num_pq_chunks, std::vector<float> &pivot_data_vector);

// File-based PQ encoder, templated on the element type T of the input data.
// NOTE(review): the definition is not visible from this hunk. Judging by the
// parameter names it presumably reads vectors from data_file and pivots from
// pq_pivots_path, then writes the compressed codes to
// pq_compressed_vectors_path -- confirm against src/pq.cpp.
// use_opq defaults to false.
template <typename T>
int generate_pq_data_from_pivots(const std::string &data_file, unsigned num_centers, unsigned num_pq_chunks,
const std::string &pq_pivots_path, const std::string &pq_compressed_vectors_path,
bool use_opq = false);

// Encodes in-memory float vectors into PQ codes using previously trained pivots
// (simplified counterpart of generate_pq_data_from_pivots that performs no
// file I/O).
//   data          - num vectors of dim floats each, stored row-major.
//   num           - number of vectors to encode.
//   pivot_data    - pivot table laid out as 256 rows of dim floats.
//   pivots_num    - number of floats in pivot_data; must equal 256 * dim.
//   dim           - dimensionality of each vector.
//   num_pq_chunks - number of PQ chunks; must be non-zero and evenly divide dim.
//   pq            - output; resized to num * num_pq_chunks bytes, where
//                   pq[j * num_pq_chunks + i] is the index of the closest
//                   center for vector j within chunk i.
// Returns 0 on success and -1 when the arguments are rejected.
DISKANN_DLLEXPORT int generate_pq_data_from_pivots_simplified(const float *data, const size_t num,
const float *pivot_data, const size_t pivots_num,
const size_t dim, const size_t num_pq_chunks,
std::vector<uint8_t> &pq);

template <typename T>
void generate_disk_quantized_data(const std::string &data_file_to_use, const std::string &disk_pq_pivots_path,
const std::string &disk_pq_compressed_vectors_path,
Expand Down
128 changes: 128 additions & 0 deletions src/pq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,65 @@ void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_n
}
}

// generate_pq_pivots_simplified is a simplified version of generate_pq_pivots.
// Input is provided in the in-memory buffer train_data (num_train vectors of
// dim floats each, row-major).
// Output is stored in the in-memory buffer pivot_data_vector, which is resized
// to num_centers * dim floats: one row of dim floats per center, where the
// centroid of chunk i occupies columns [i * chunk_size, (i + 1) * chunk_size).
// The simplification is based on the following assumptions:
//   dim % num_pq_chunks == 0
//   num_centers == 256 by default
//   KMEANS_ITERS_FOR_PQ == 15 by default
//   make_zero_mean is false by default.
// These assumptions make the function much simpler and avoid storing the
// arrays of chunk_offsets and centroids.
// The compiler pragma for multi-threading support is removed from this implementation
// for the purpose of integration into systems that strictly control resource allocation.
//
// Returns 0 on success. Returns -1, leaving pivot_data_vector untouched, when
// train_data is null, num_train is 0, num_pq_chunks is 0, num_pq_chunks > dim,
// or num_pq_chunks does not evenly divide dim.
int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim, size_t num_pq_chunks,
                                  std::vector<float> &pivot_data_vector)
{
    // num_pq_chunks == 0 must be rejected before the modulo below: a remainder
    // by zero is undefined behaviour. An empty or missing training set cannot
    // seed any centers either.
    if (train_data == nullptr || num_train == 0 || num_pq_chunks == 0 || num_pq_chunks > dim ||
        dim % num_pq_chunks != 0)
    {
        return -1;
    }

    const size_t num_centers = 256;
    const size_t cur_chunk_size = dim / num_pq_chunks;
    const uint32_t KMEANS_ITERS_FOR_PQ = 15;

    pivot_data_vector.resize(num_centers * dim);
    std::vector<float> cur_pivot_data_vector(num_centers * cur_chunk_size);
    std::vector<float> cur_data_vector(num_train * cur_chunk_size);
    std::vector<uint32_t> closest_center_vector(num_train);

    float *pivot_data = pivot_data_vector.data();
    float *cur_pivot_data = cur_pivot_data_vector.data();
    float *cur_data = cur_data_vector.data();
    uint32_t *closest_center = closest_center_vector.data();

    for (size_t i = 0; i < num_pq_chunks; i++)
    {
        const size_t chunk_offset = cur_chunk_size * i;

        // Gather the i-th chunk of every training vector into a dense
        // num_train x cur_chunk_size matrix for k-means.
        for (size_t j = 0; j < num_train; j++)
        {
            std::memcpy(cur_data + j * cur_chunk_size, train_data + j * dim + chunk_offset,
                        cur_chunk_size * sizeof(float));
        }

        // Seed the centers, then refine them with a fixed number of Lloyd iterations.
        kmeans::kmeanspp_selecting_pivots(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers);

        kmeans::run_lloyds(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers, KMEANS_ITERS_FOR_PQ,
                           nullptr, closest_center);

        // Scatter this chunk's centroids back into the full-dimension pivot table.
        for (size_t j = 0; j < num_centers; j++)
        {
            std::memcpy(pivot_data + j * dim + chunk_offset, cur_pivot_data + j * cur_chunk_size,
                        cur_chunk_size * sizeof(float));
        }
    }

    return 0;
}

// given training data in train_data of dimensions num_train * dim, generate
// PQ pivots using k-means algorithm to partition the co-ordinates into
// num_pq_chunks (if it divides dimension, else rounded) chunks, and runs
Expand Down Expand Up @@ -712,6 +771,75 @@ int generate_opq_pivots(const float *passed_train_data, size_t num_train, uint32
return 0;
}

// generate_pq_data_from_pivots_simplified is a simplified version of generate_pq_data_from_pivots.
// Input is provided in the in-memory buffers data (num vectors of dim floats
// each, row-major) and pivot_data (256 rows of dim floats; pivots_num is the
// total number of floats in it).
// Output is stored in the in-memory buffer pq, which is resized to
// num * num_pq_chunks bytes; pq[j * num_pq_chunks + i] is the index of the
// closest center for vector j within chunk i.
// The simplification is based on the following assumptions:
//   supporting only float data type
//   dim % num_pq_chunks == 0, which results in a fixed chunk_size
//   num_centers == 256 by default
//   make_zero_mean is false by default.
// These assumptions make the function much simpler and avoid using the
// arrays of chunk_offsets and centroids.
// The compiler pragma for multi-threading support is removed from this implementation
// for the purpose of integration into systems that strictly control resource allocation.
//
// Returns 0 on success (including num == 0, which yields an empty pq).
// Returns -1 when num_pq_chunks is 0, exceeds dim, or does not evenly divide
// dim; when pivot_data is null or pivots_num != 256 * dim; or when data is
// null while num > 0.
int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, const float *pivot_data,
                                            const size_t pivots_num, const size_t dim, const size_t num_pq_chunks,
                                            std::vector<uint8_t> &pq)
{
    if (num_pq_chunks == 0 || num_pq_chunks > dim || dim % num_pq_chunks != 0)
    {
        return -1;
    }

    const size_t num_centers = 256;
    const size_t chunk_size = dim / num_pq_chunks;

    if (pivot_data == nullptr || pivots_num != num_centers * dim)
    {
        return -1;
    }

    // Nothing to encode: produce an empty result without touching empty
    // scratch buffers or invoking the nearest-center search on zero points.
    if (num == 0)
    {
        pq.clear();
        return 0;
    }

    if (data == nullptr)
    {
        return -1;
    }

    pq.resize(num * num_pq_chunks);

    std::vector<float> cur_pivot_vector(num_centers * chunk_size);
    std::vector<float> cur_data_vector(num * chunk_size);
    std::vector<uint32_t> closest_center_vector(num);

    float *cur_pivot_data = cur_pivot_vector.data();
    float *cur_data = cur_data_vector.data();
    uint32_t *closest_center = closest_center_vector.data();

    for (size_t i = 0; i < num_pq_chunks; i++)
    {
        const size_t chunk_offset = chunk_size * i;

        // Extract this chunk's slice of every pivot into a dense
        // num_centers x chunk_size matrix.
        for (size_t j = 0; j < num_centers; j++)
        {
            std::memcpy(cur_pivot_data + j * chunk_size, pivot_data + j * dim + chunk_offset,
                        chunk_size * sizeof(float));
        }

        // Extract the same slice of every data vector into a dense
        // num x chunk_size matrix.
        for (size_t j = 0; j < num; j++)
        {
            std::memcpy(cur_data + j * chunk_size, data + j * dim + chunk_offset, chunk_size * sizeof(float));
        }

        math_utils::compute_closest_centers(cur_data, num, chunk_size, cur_pivot_data, num_centers, 1, closest_center);

        for (size_t j = 0; j < num; j++)
        {
            // With 256 centers every index fits in one byte, so the narrowing
            // cast below is lossless.
            assert(closest_center[j] < num_centers);
            pq[j * num_pq_chunks + i] = static_cast<uint8_t>(closest_center[j]);
        }
    }

    return 0;
}

// streams the base file (data_file), and computes the closest centers in each
// chunk to generate the compressed data_file and stores it in
// pq_compressed_vectors_path.
Expand Down

0 comments on commit a25ee6f

Please sign in to comment.