Skip to content

Commit 054b27f

Browse files
datamap iter test OK. skiplist in dev-dependency
1 parent ec2145f commit 054b27f

File tree

2 files changed

+71
-14
lines changed

2 files changed

+71
-14
lines changed

Diff for: Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ num-traits = { version = "0.2" }
7070

7171
# for hashing. hashbrown is still needed because of get_key_value(&key)
7272
hashbrown = { version = "0.14" }
73-
skiplist = { version = "0.5" }
73+
indexmap = { version = "2.2" }
7474

7575
rand = { version = "0.8" }
7676
lazy_static = { version = "1.4" }
@@ -95,6 +95,7 @@ simdeez = { version = "1.0", optional = true }
9595
[dev-dependencies]
9696
hdf5 = { version = "0.8" }
9797
ndarray = { version = "0.15" }
98+
skiplist = { version = "0.5" }
9899

99100

100101
[features]

Diff for: src/datamap.rs

+69-13
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,12 @@
44
//! - a Hashmap from DataId to address
55
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
66
7-
#![allow(unused)]
8-
97
use std::io::BufReader;
108

119
use std::fs::{File, OpenOptions};
1210
use std::path::PathBuf;
1311

14-
use hashbrown::{hash_map::Keys, HashMap};
12+
use indexmap::map::IndexMap;
1513
use log::log_enabled;
1614
use mmap_rs::{Mmap, MmapOptions};
1715

@@ -24,11 +22,11 @@ use crate::hnswio::MAGICDATAP;
2422
// possibly to be used in graph to spare memory?
2523
pub struct DataMap {
2624
/// File containing Points data
27-
datapath: PathBuf,
25+
_datapath: PathBuf,
2826
/// The mmap structure
2927
mmap: Mmap,
3028
/// map a dataId to an address where we get a bson encoded vector of type T
31-
hmap: HashMap<DataId, usize>,
29+
hmap: IndexMap<DataId, usize>,
3230
/// type name of Data
3331
t_name: String,
3432
/// dimension of data vector
@@ -156,7 +154,7 @@ impl DataMap {
156154
let nb_record = residual / record_size;
157155
log::debug!("record size : {}, nb_record : {}", record_size, nb_record);
158156
// allocate hmap with correct capacity
159-
let mut hmap = HashMap::<DataId, usize>::with_capacity(nb_record);
157+
let mut hmap = IndexMap::<DataId, usize>::with_capacity(nb_record);
160158
// fill hmap to have address of each data point in file
161159
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
162160
//
@@ -213,7 +211,7 @@ impl DataMap {
213211
log::debug!("\n end of DataMap::from_hnsw \n");
214212
//
215213
let datamap = DataMap {
216-
datapath,
214+
_datapath: datapath,
217215
mmap,
218216
hmap,
219217
t_name,
@@ -258,7 +256,8 @@ impl DataMap {
258256
}
259257
} // end of check_data_type
260258

261-
/// return the data corresponding to dataid. Access is done via mmap. returns None if address is invalid
259+
/// return the data corresponding to dataid. Access is done using mmap.
260+
/// Function returns None if address is invalid
262261
pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
263262
//
264263
log::trace!("in DataMap::get_data, dataid : {:?}", dataid);
@@ -285,8 +284,9 @@ impl DataMap {
285284
Some(slice_t)
286285
}
287286

288-
/// returns an iterator
289-
pub fn get_dataid_iter(&self) -> Keys<DataId, usize> {
287+
/// returns Keys in the order they are in the file, thus optimizing file/memory access.
288+
/// Note that in case of parallel insertion this can be different from insertion order.
289+
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<DataId, usize> {
290290
return self.hmap.keys();
291291
}
292292
} // end of impl DataMap
@@ -379,10 +379,66 @@ mod tests {
379379
// test iterator from datamap
380380
let keys = datamap.get_dataid_iter();
381381
for k in keys {
382-
let data = datamap.get_data::<f32>(k);
382+
let _data = datamap.get_data::<f32>(k);
383383
}
384384
// rm files generated!
385-
std::fs::remove_file("mmap_test.hnsw.data");
386-
std::fs::remove_file("mmap_test.hnsw.graph");
385+
let _ = std::fs::remove_file("mmap_test.hnsw.data");
386+
let _ = std::fs::remove_file("mmap_test.hnsw.graph");
387387
} // end of test_file_mmap
388+
389+
#[test]
390+
fn test_mmap_iter() {
391+
log_init_test();
392+
// generate a random test
393+
let mut rng = rand::thread_rng();
394+
let unif = Uniform::<u32>::new(0, 10000);
395+
// 50 vectors (nbcolumn) of 11 u32 values each (nbrow)
396+
let nbcolumn = 50;
397+
let nbrow = 11;
398+
let mut xsi;
399+
let mut data = Vec::with_capacity(nbcolumn);
400+
for j in 0..nbcolumn {
401+
data.push(Vec::with_capacity(nbrow));
402+
for _ in 0..nbrow {
403+
xsi = unif.sample(&mut rng);
404+
data[j].push(xsi);
405+
}
406+
log::debug!("j : {:?}, data : {:?} ", j, &data[j]);
407+
}
408+
// define hnsw
409+
let ef_construct = 25;
410+
let nb_connection = 10;
411+
let hnsw = Hnsw::<u32, dist::DistL1>::new(
412+
nb_connection,
413+
nbcolumn,
414+
16,
415+
ef_construct,
416+
dist::DistL1 {},
417+
);
418+
for i in 0..data.len() {
419+
hnsw.insert((&data[i], i));
420+
}
421+
// some logging info
422+
hnsw.dump_layer_info();
423+
// dump in a file. Must take care of the name as tests run in parallel !!!
424+
let fname = String::from("mmap_order_test");
425+
let _res = hnsw.file_dump(&fname);
426+
// now that the dump is done, test reloading the hnsw data through mmap
427+
let datamap: DataMap = DataMap::from_hnswdump::<u32>(".", &fname).unwrap();
428+
// testing type check
429+
assert!(datamap.check_data_type::<u32>());
430+
assert!(!datamap.check_data_type::<f32>());
431+
log::info!("Datamap iteration order checking");
432+
let keys = datamap.get_dataid_iter();
433+
let mut ukey = 0usize;
434+
for dataid in keys {
435+
let v = datamap.get_data::<u32>(dataid).unwrap();
436+
assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, ukey);
437+
ukey += 1;
438+
}
439+
// rm files generated!
440+
let _ = std::fs::remove_file("mmap_order_test.hnsw.data");
441+
let _ = std::fs::remove_file("mmap_order_test.hnsw.graph");
442+
}
443+
//
388444
} // end of mod tests

0 commit comments

Comments
 (0)