4
4
//! - a Hashmap from DataId to address
5
5
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
6
6
7
- #![ allow( unused) ]
8
-
9
7
use std:: io:: BufReader ;
10
8
11
9
use std:: fs:: { File , OpenOptions } ;
12
10
use std:: path:: PathBuf ;
13
11
14
- use hashbrown :: { hash_map :: Keys , HashMap } ;
12
+ use indexmap :: map :: IndexMap ;
15
13
use log:: log_enabled;
16
14
use mmap_rs:: { Mmap , MmapOptions } ;
17
15
@@ -24,11 +22,11 @@ use crate::hnswio::MAGICDATAP;
24
22
// possibly to be used in graph to spare memory?
25
23
pub struct DataMap {
26
24
/// File containing Points data
27
- datapath : PathBuf ,
25
+ _datapath : PathBuf ,
28
26
/// The mmap structure
29
27
mmap : Mmap ,
30
28
/// map a dataId to an address where we get a bson encoded vector of type T
31
- hmap : HashMap < DataId , usize > ,
29
+ hmap : IndexMap < DataId , usize > ,
32
30
/// type name of Data
33
31
t_name : String ,
34
32
/// dimension of data vector
@@ -156,7 +154,7 @@ impl DataMap {
156
154
let nb_record = residual / record_size;
157
155
log:: debug!( "record size : {}, nb_record : {}" , record_size, nb_record) ;
158
156
// allocate hmap with correct capacity
159
- let mut hmap = HashMap :: < DataId , usize > :: with_capacity ( nb_record) ;
157
+ let mut hmap = IndexMap :: < DataId , usize > :: with_capacity ( nb_record) ;
160
158
// fill hmap to have address of each data point in file
161
159
let mut u64_slice = [ 0u8 ; std:: mem:: size_of :: < u64 > ( ) ] ;
162
160
//
@@ -213,7 +211,7 @@ impl DataMap {
213
211
log:: debug!( "\n end of DataMap::from_hnsw \n " ) ;
214
212
//
215
213
let datamap = DataMap {
216
- datapath,
214
+ _datapath : datapath,
217
215
mmap,
218
216
hmap,
219
217
t_name,
@@ -258,7 +256,8 @@ impl DataMap {
258
256
}
259
257
} // end of check_data_type
260
258
261
- /// return the data corresponding to dataid. Access is done via mmap. returns None if address is invalid
259
+ /// return the data corresponding to dataid. Access is done using mmap.
260
+ /// Function returns None if address is invalid
262
261
pub fn get_data < ' a , T : Clone + std:: fmt:: Debug > ( & ' a self , dataid : & DataId ) -> Option < & ' a [ T ] > {
263
262
//
264
263
log:: trace!( "in DataMap::get_data, dataid : {:?}" , dataid) ;
@@ -285,8 +284,9 @@ impl DataMap {
285
284
Some ( slice_t)
286
285
}
287
286
288
- /// returns an iterator
289
- pub fn get_dataid_iter ( & self ) -> Keys < DataId , usize > {
287
+ /// returns Keys in order they are in the file, thus optimizing file/memory access.
288
+ /// Note that in case of parallel insertion this can be different from insertion order.
289
+ pub fn get_dataid_iter ( & self ) -> indexmap:: map:: Keys < DataId , usize > {
290
290
return self . hmap . keys ( ) ;
291
291
}
292
292
} // end of impl DataMap
@@ -379,10 +379,66 @@ mod tests {
379
379
// test iterator from datamap
380
380
let keys = datamap. get_dataid_iter ( ) ;
381
381
for k in keys {
382
- let data = datamap. get_data :: < f32 > ( k) ;
382
+ let _data = datamap. get_data :: < f32 > ( k) ;
383
383
}
384
384
// rm files generated!
385
- std:: fs:: remove_file ( "mmap_test.hnsw.data" ) ;
386
- std:: fs:: remove_file ( "mmap_test.hnsw.graph" ) ;
385
+ let _ = std:: fs:: remove_file ( "mmap_test.hnsw.data" ) ;
386
+ let _ = std:: fs:: remove_file ( "mmap_test.hnsw.graph" ) ;
387
387
} // end of test_file_mmap
388
+
389
+ #[ test]
390
+ fn test_mmap_iter ( ) {
391
+ log_init_test ( ) ;
392
+ // generate a random test
393
+ let mut rng = rand:: thread_rng ( ) ;
394
+ let unif = Uniform :: < u32 > :: new ( 0 , 10000 ) ;
395
+ // 50 vectors (nbcolumn) of 11 u32 values (nbrow) each
396
+ let nbcolumn = 50 ;
397
+ let nbrow = 11 ;
398
+ let mut xsi;
399
+ let mut data = Vec :: with_capacity ( nbcolumn) ;
400
+ for j in 0 ..nbcolumn {
401
+ data. push ( Vec :: with_capacity ( nbrow) ) ;
402
+ for _ in 0 ..nbrow {
403
+ xsi = unif. sample ( & mut rng) ;
404
+ data[ j] . push ( xsi) ;
405
+ }
406
+ log:: debug!( "j : {:?}, data : {:?} " , j, & data[ j] ) ;
407
+ }
408
+ // define hnsw
409
+ let ef_construct = 25 ;
410
+ let nb_connection = 10 ;
411
+ let hnsw = Hnsw :: < u32 , dist:: DistL1 > :: new (
412
+ nb_connection,
413
+ nbcolumn,
414
+ 16 ,
415
+ ef_construct,
416
+ dist:: DistL1 { } ,
417
+ ) ;
418
+ for i in 0 ..data. len ( ) {
419
+ hnsw. insert ( ( & data[ i] , i) ) ;
420
+ }
421
+ // some logging info
422
+ hnsw. dump_layer_info ( ) ;
423
+ // dump in a file. Must take care of the name as tests run in parallel !!!
424
+ let fname = String :: from ( "mmap_order_test" ) ;
425
+ let _res = hnsw. file_dump ( & fname) ;
426
+ // the dump is done, now test reload of the hnsw data with mmap
427
+ let datamap: DataMap = DataMap :: from_hnswdump :: < u32 > ( "." , & fname) . unwrap ( ) ;
428
+ // testing type check
429
+ assert ! ( datamap. check_data_type:: <u32 >( ) ) ;
430
+ assert ! ( !datamap. check_data_type:: <f32 >( ) ) ;
431
+ log:: info!( "Datamap iteration order checking" ) ;
432
+ let keys = datamap. get_dataid_iter ( ) ;
433
+ let mut ukey = 0usize ;
434
+ for dataid in keys {
435
+ let v = datamap. get_data :: < u32 > ( dataid) . unwrap ( ) ;
436
+ assert_eq ! ( v, & data[ * dataid] , "dataid = {}, ukey = {}" , dataid, ukey) ;
437
+ ukey += 1 ;
438
+ }
439
+ // rm files generated!
440
+ let _ = std:: fs:: remove_file ( "mmap_order_test.hnsw.data" ) ;
441
+ let _ = std:: fs:: remove_file ( "mmap_order_test.hnsw.graph" ) ;
442
+ }
443
+ //
388
444
} // end of mod tests
0 commit comments