Skip to content

Commit 141e36c

Browse files
committed
Test minmer
1 parent ec3b6da commit 141e36c

File tree

1 file changed

+159
-1
lines changed

1 file changed

+159
-1
lines changed

src/lib.rs

+159-1
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,17 @@ pub mod utils;
77

88
#[cfg(test)]
99
mod tests {
10+
use std::collections::HashSet;
11+
use std::default;
1012
use std::hash::Hasher;
1113

1214
use std::time::Instant;
1315

16+
use needletail::Sequence;
1417
use rand_xoshiro::Xoroshiro64Star;
18+
use rayon::iter::Enumerate;
1519
use wyhash::{wyrng, WyRng};
20+
use xxhash_rust::xxh3::xxh3_64_with_seed;
1621

1722
use crate::hd;
1823
use crate::{
@@ -125,12 +130,35 @@ mod tests {
125130
while let Some(record) = fastx_reader.next() {
126131
let seq = record.expect("invalid record");
127132

128-
for i in seq.bit_kmers(17, true) {
133+
println!("{:?}", String::from_utf8(seq.seq().to_vec()));
134+
for i in seq.bit_kmers(5, true) {
129135
println!("{:?}\t{:?}", i, 0);
130136
}
131137
}
132138
}
133139

140+
#[test]
141+
fn test_fast_xxhash() {
142+
use needletail::parse_fastx_file;
143+
use xxhash_rust::xxh3::xxh3_64;
144+
145+
let file = "./test/test.fna";
146+
let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
147+
148+
while let Some(record) = fastx_reader.next() {
149+
let seq = record.expect("invalid record");
150+
let norm_seq = seq.normalize(false);
151+
152+
let rc = norm_seq.reverse_complement();
153+
154+
println!("{:?}", String::from_utf8(norm_seq.to_vec()));
155+
for (_, kmer, _) in norm_seq.canonical_kmers(5, &rc) {
156+
let h = xxh3_64(kmer);
157+
println!("{:?}\t{:?}", String::from_utf8(kmer.to_vec()), h);
158+
}
159+
}
160+
}
161+
134162
#[test]
135163
fn test_vec_norm() {
136164
let a = vec![1, 2, 3, -4];
@@ -265,4 +293,134 @@ mod tests {
265293
);
266294
assert_eq!(&hv, &hv_decompressed);
267295
}
296+
297+
#[test]
298+
fn test_xxhash_rng() {
299+
let mut rng = xxhash_rust::xxh3::Xxh3::with_seed(1);
300+
301+
for i in 0..4 {
302+
let h = rng.digest();
303+
rng.write_u64(h);
304+
println!("{}", h);
305+
}
306+
}
307+
308+
#[test]
309+
fn test_minmer() {
310+
use needletail::sequence::minimizer;
311+
use rust_seq2kminmers::KminmersIterator;
312+
313+
let seq = b"AACTGCACTGCACTGCACTGCACACTGCACTGCACTGCACTGCACACTGCACTGCACTGACTGCACTGCACTGCACTGCACTGCCTGC";
314+
315+
let iter = KminmersIterator::new(seq, 4, 7, 0.1, false).unwrap();
316+
for kminmer in iter {
317+
println!("kminmer: {:?}", kminmer);
318+
println!("{:?}", kminmer.mers()[0]);
319+
}
320+
321+
// let mz = minimizer(seq, 5);
322+
// println!("{:?}", mz);
323+
}
324+
325+
#[test]
326+
fn test_minimizer_strobemer() {
327+
use needletail::{bitkmer, parse_fastx_file, Sequence};
328+
use rust_seq2kminmers::KminmersIterator;
329+
use std::collections::HashSet;
330+
use std::time;
331+
332+
fn Jaccard_to_ANI(J: f32, k: usize) -> f32 {
333+
let ani: f32 = 1.0 + (2.0 / (1.0 / J + 1.0)).ln() / (k as f32);
334+
ani
335+
}
336+
337+
// use minimizer for sampling
338+
let file_query = "./test/Escherichia_coli_0_1288_GCA_000303255.LargeContigs.fna";
339+
// let file_ref = "./test/Escherichia_coli_2871950_GCA_000355455.LargeContigs.fna";
340+
let file_ref = "./test/Escherichia_coli_HVH_217__4_1022806__GCA_000459375.LargeContigs.fna";
341+
let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
342+
343+
let w = 20;
344+
let k = 21;
345+
let density = 0.01;
346+
347+
for (i, file) in [file_query, file_ref].iter().enumerate() {
348+
let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
349+
350+
while let Some(record) = fastx_reader.next() {
351+
let seq = record.expect("invalid record");
352+
let seq = seq.normalize(false);
353+
354+
let iter = KminmersIterator::new(seq.sequence(), k, w, density, false).unwrap();
355+
for kminmer in iter {
356+
sketch_vecs[i].insert(kminmer.mers()[0]);
357+
// println!("kminmer: {:?}", kminmer);
358+
// println!("{:?}", kminmer.mers()[0]);
359+
}
360+
// println!("{:?}", String::from_utf8(seq.seq().to_vec()));
361+
// for i in seq.bit_kmers(5, true) {
362+
// println!("{:?}\t{:?}", i, 0);
363+
// }
364+
// return;
365+
}
366+
println!("{:?}", sketch_vecs[i].len());
367+
}
368+
369+
let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
370+
let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
371+
let J = (i as f32) / u as f32;
372+
let ANI = Jaccard_to_ANI(J, k);
373+
374+
println!("Minmer: J = {}\t ANI = {}", J, ANI);
375+
376+
//
377+
let scaled = 2000;
378+
let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
379+
380+
let now = time::Instant::now();
381+
for (i, file) in [file_query, file_ref].iter().enumerate() {
382+
let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
383+
384+
while let Some(record) = fastx_reader.next() {
385+
let seq = record.expect("invalid record");
386+
let norm_seq = seq.normalize(false);
387+
388+
// let rc = norm_seq.reverse_complement();
389+
// for (_, kmer, _) in norm_seq.canonical_kmers(k as u8, &rc) {
390+
// let h = xxh3_64_with_seed(kmer, 1);
391+
// if h < u64::MAX / scaled {
392+
// sketch_vecs[i].insert(h);
393+
// }
394+
// }
395+
396+
for kmer in norm_seq.kmers(k as u8) {
397+
let h = xxh3_64_with_seed(kmer, 1);
398+
if h < u64::MAX / scaled {
399+
sketch_vecs[i].insert(h);
400+
}
401+
}
402+
403+
// for (_, (kmer_u64, _), _) in norm_seq.bit_kmers(k as u8, true) {
404+
// let h = xxh3_64_with_seed(&kmer_u64.to_be_bytes(), 1);
405+
// if h < u64::MAX / scaled {
406+
// sketch_vecs[i].insert(h);
407+
// }
408+
// }
409+
}
410+
println!("{:?}", sketch_vecs[i].len());
411+
}
412+
413+
let elapsed = now.elapsed();
414+
415+
let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
416+
let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
417+
let J = (i as f32) / u as f32;
418+
let ANI = Jaccard_to_ANI(J, k);
419+
420+
println!(
421+
"kmer: J = {}\t ANI = {}, elapsed time = {:?}",
422+
J, ANI, elapsed
423+
);
424+
// use strobemer for sampling
425+
}
268426
}

0 commit comments

Comments
 (0)