@@ -7,12 +7,17 @@ pub mod utils;
7
7
8
8
#[ cfg( test) ]
9
9
mod tests {
10
+ use std:: collections:: HashSet ;
11
+ use std:: default;
10
12
use std:: hash:: Hasher ;
11
13
12
14
use std:: time:: Instant ;
13
15
16
+ use needletail:: Sequence ;
14
17
use rand_xoshiro:: Xoroshiro64Star ;
18
+ use rayon:: iter:: Enumerate ;
15
19
use wyhash:: { wyrng, WyRng } ;
20
+ use xxhash_rust:: xxh3:: xxh3_64_with_seed;
16
21
17
22
use crate :: hd;
18
23
use crate :: {
@@ -125,12 +130,35 @@ mod tests {
125
130
while let Some ( record) = fastx_reader. next ( ) {
126
131
let seq = record. expect ( "invalid record" ) ;
127
132
128
- for i in seq. bit_kmers ( 17 , true ) {
133
+ println ! ( "{:?}" , String :: from_utf8( seq. seq( ) . to_vec( ) ) ) ;
134
+ for i in seq. bit_kmers ( 5 , true ) {
129
135
println ! ( "{:?}\t {:?}" , i, 0 ) ;
130
136
}
131
137
}
132
138
}
133
139
140
+ #[ test]
141
+ fn test_fast_xxhash ( ) {
142
+ use needletail:: parse_fastx_file;
143
+ use xxhash_rust:: xxh3:: xxh3_64;
144
+
145
+ let file = "./test/test.fna" ;
146
+ let mut fastx_reader = parse_fastx_file ( & file) . expect ( "Opening .fna files failed" ) ;
147
+
148
+ while let Some ( record) = fastx_reader. next ( ) {
149
+ let seq = record. expect ( "invalid record" ) ;
150
+ let norm_seq = seq. normalize ( false ) ;
151
+
152
+ let rc = norm_seq. reverse_complement ( ) ;
153
+
154
+ println ! ( "{:?}" , String :: from_utf8( norm_seq. to_vec( ) ) ) ;
155
+ for ( _, kmer, _) in norm_seq. canonical_kmers ( 5 , & rc) {
156
+ let h = xxh3_64 ( kmer) ;
157
+ println ! ( "{:?}\t {:?}" , String :: from_utf8( kmer. to_vec( ) ) , h) ;
158
+ }
159
+ }
160
+ }
161
+
134
162
#[ test]
135
163
fn test_vec_norm ( ) {
136
164
let a = vec ! [ 1 , 2 , 3 , -4 ] ;
@@ -265,4 +293,134 @@ mod tests {
265
293
) ;
266
294
assert_eq ! ( & hv, & hv_decompressed) ;
267
295
}
296
+
297
+ #[ test]
298
+ fn test_xxhash_rng ( ) {
299
+ let mut rng = xxhash_rust:: xxh3:: Xxh3 :: with_seed ( 1 ) ;
300
+
301
+ for i in 0 ..4 {
302
+ let h = rng. digest ( ) ;
303
+ rng. write_u64 ( h) ;
304
+ println ! ( "{}" , h) ;
305
+ }
306
+ }
307
+
308
+ #[ test]
309
+ fn test_minmer ( ) {
310
+ use needletail:: sequence:: minimizer;
311
+ use rust_seq2kminmers:: KminmersIterator ;
312
+
313
+ let seq = b"AACTGCACTGCACTGCACTGCACACTGCACTGCACTGCACTGCACACTGCACTGCACTGACTGCACTGCACTGCACTGCACTGCCTGC" ;
314
+
315
+ let iter = KminmersIterator :: new ( seq, 4 , 7 , 0.1 , false ) . unwrap ( ) ;
316
+ for kminmer in iter {
317
+ println ! ( "kminmer: {:?}" , kminmer) ;
318
+ println ! ( "{:?}" , kminmer. mers( ) [ 0 ] ) ;
319
+ }
320
+
321
+ // let mz = minimizer(seq, 5);
322
+ // println!("{:?}", mz);
323
+ }
324
+
325
+ #[ test]
326
+ fn test_minimizer_strobemer ( ) {
327
+ use needletail:: { bitkmer, parse_fastx_file, Sequence } ;
328
+ use rust_seq2kminmers:: KminmersIterator ;
329
+ use std:: collections:: HashSet ;
330
+ use std:: time;
331
+
332
+ fn Jaccard_to_ANI ( J : f32 , k : usize ) -> f32 {
333
+ let ani: f32 = 1.0 + ( 2.0 / ( 1.0 / J + 1.0 ) ) . ln ( ) / ( k as f32 ) ;
334
+ ani
335
+ }
336
+
337
+ // use minimizer for sampling
338
+ let file_query = "./test/Escherichia_coli_0_1288_GCA_000303255.LargeContigs.fna" ;
339
+ // let file_ref = "./test/Escherichia_coli_2871950_GCA_000355455.LargeContigs.fna";
340
+ let file_ref = "./test/Escherichia_coli_HVH_217__4_1022806__GCA_000459375.LargeContigs.fna" ;
341
+ let mut sketch_vecs: Vec < HashSet < u64 > > = vec ! [ HashSet :: <u64 >:: default ( ) ; 2 ] ;
342
+
343
+ let w = 20 ;
344
+ let k = 21 ;
345
+ let density = 0.01 ;
346
+
347
+ for ( i, file) in [ file_query, file_ref] . iter ( ) . enumerate ( ) {
348
+ let mut fastx_reader = parse_fastx_file ( & file) . expect ( "Opening .fna files failed" ) ;
349
+
350
+ while let Some ( record) = fastx_reader. next ( ) {
351
+ let seq = record. expect ( "invalid record" ) ;
352
+ let seq = seq. normalize ( false ) ;
353
+
354
+ let iter = KminmersIterator :: new ( seq. sequence ( ) , k, w, density, false ) . unwrap ( ) ;
355
+ for kminmer in iter {
356
+ sketch_vecs[ i] . insert ( kminmer. mers ( ) [ 0 ] ) ;
357
+ // println!("kminmer: {:?}", kminmer);
358
+ // println!("{:?}", kminmer.mers()[0]);
359
+ }
360
+ // println!("{:?}", String::from_utf8(seq.seq().to_vec()));
361
+ // for i in seq.bit_kmers(5, true) {
362
+ // println!("{:?}\t{:?}", i, 0);
363
+ // }
364
+ // return;
365
+ }
366
+ println ! ( "{:?}" , sketch_vecs[ i] . len( ) ) ;
367
+ }
368
+
369
+ let i = sketch_vecs[ 0 ] . intersection ( & sketch_vecs[ 1 ] ) . count ( ) ;
370
+ let u = sketch_vecs[ 0 ] . union ( & sketch_vecs[ 1 ] ) . count ( ) ;
371
+ let J = ( i as f32 ) / u as f32 ;
372
+ let ANI = Jaccard_to_ANI ( J , k) ;
373
+
374
+ println ! ( "Minmer: J = {}\t ANI = {}" , J , ANI ) ;
375
+
376
+ //
377
+ let scaled = 2000 ;
378
+ let mut sketch_vecs: Vec < HashSet < u64 > > = vec ! [ HashSet :: <u64 >:: default ( ) ; 2 ] ;
379
+
380
+ let now = time:: Instant :: now ( ) ;
381
+ for ( i, file) in [ file_query, file_ref] . iter ( ) . enumerate ( ) {
382
+ let mut fastx_reader = parse_fastx_file ( & file) . expect ( "Opening .fna files failed" ) ;
383
+
384
+ while let Some ( record) = fastx_reader. next ( ) {
385
+ let seq = record. expect ( "invalid record" ) ;
386
+ let norm_seq = seq. normalize ( false ) ;
387
+
388
+ // let rc = norm_seq.reverse_complement();
389
+ // for (_, kmer, _) in norm_seq.canonical_kmers(k as u8, &rc) {
390
+ // let h = xxh3_64_with_seed(kmer, 1);
391
+ // if h < u64::MAX / scaled {
392
+ // sketch_vecs[i].insert(h);
393
+ // }
394
+ // }
395
+
396
+ for kmer in norm_seq. kmers ( k as u8 ) {
397
+ let h = xxh3_64_with_seed ( kmer, 1 ) ;
398
+ if h < u64:: MAX / scaled {
399
+ sketch_vecs[ i] . insert ( h) ;
400
+ }
401
+ }
402
+
403
+ // for (_, (kmer_u64, _), _) in norm_seq.bit_kmers(k as u8, true) {
404
+ // let h = xxh3_64_with_seed(&kmer_u64.to_be_bytes(), 1);
405
+ // if h < u64::MAX / scaled {
406
+ // sketch_vecs[i].insert(h);
407
+ // }
408
+ // }
409
+ }
410
+ println ! ( "{:?}" , sketch_vecs[ i] . len( ) ) ;
411
+ }
412
+
413
+ let elapsed = now. elapsed ( ) ;
414
+
415
+ let i = sketch_vecs[ 0 ] . intersection ( & sketch_vecs[ 1 ] ) . count ( ) ;
416
+ let u = sketch_vecs[ 0 ] . union ( & sketch_vecs[ 1 ] ) . count ( ) ;
417
+ let J = ( i as f32 ) / u as f32 ;
418
+ let ANI = Jaccard_to_ANI ( J , k) ;
419
+
420
+ println ! (
421
+ "kmer: J = {}\t ANI = {}, elapsed time = {:?}" ,
422
+ J , ANI , elapsed
423
+ ) ;
424
+ // use strobemer for sampling
425
+ }
268
426
}
0 commit comments