diff --git a/Cargo.toml b/Cargo.toml index 264b86c..66a548d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bqtools" -version = "0.5.4" +version = "0.5.5" edition = "2021" license = "MIT" authors = ["Noam Teyssier "] @@ -27,9 +27,9 @@ nix = { version = "0.31.1", features = ["fs"] } num_cpus = "1.17.0" paraseq = "0.4.8" parking_lot = "0.12.5" -rand = "0.9.2" +rand = "0.10.0" regex = "1.12.2" -sassy = { version = "0.1.10", optional = true } +sassy = { version = "0.2.1", optional = true } serde = { version = "1.0.228", features = ["derive"] } walkdir = "2.5.0" zstd = { version = "0.13.3", features = ["zstdmt"] } @@ -37,7 +37,6 @@ zstd = { version = "0.13.3", features = ["zstdmt"] } [dev-dependencies] bon = "3.8.2" itertools = "0.14.0" -nucgen = "0.2.0" tempfile = "3.24.0" [features] diff --git a/README.md b/README.md index 7eb2c7d..c660871 100644 --- a/README.md +++ b/README.md @@ -209,13 +209,13 @@ It will then balance the provided file/file pairs among the thread pool to ensur All options provided by `bqtools encode` will be passed through to the sub-encoders. ```bash -# Encode all FASTX files as BQ -bqtools encode --recursive --mode bq ./ +# Encode all FASTX files as CBQ +bqtools encode --recursive --mode cbq ./ -# Encode all paired FASTX files as VBQ and index their output -bqtools encode --recursive --paired --mode vbq --index ./ +# Encode all paired FASTX files as VBQ +bqtools encode --recursive --paired --mode vbq ./ -# Encode recursively with a max-subdirectory depth of 2 +# Encode as BQ recursively with a max-subdirectory depth of 2 bqtools encode --recursive --mode bq --depth 2 ./ ``` diff --git a/src/commands/grep/filter/fuzzy_matcher.rs b/src/commands/grep/filter/fuzzy_matcher.rs index d2049f3..316fa4c 100644 --- a/src/commands/grep/filter/fuzzy_matcher.rs +++ b/src/commands/grep/filter/fuzzy_matcher.rs @@ -1,17 +1,41 @@ use super::{MatchRanges, PatternMatch}; -use sassy::{profiles::Iupac, Searcher}; +use anyhow::Result; +use fixedbitset::FixedBitSet; +use log::error; +use sassy::{profiles::Iupac, EncodedPatterns, Searcher}; +type Profile = Iupac; type Patterns = Vec>; #[derive(Clone)] pub struct FuzzyMatcher { - pat1: Patterns, - pat2: Patterns, - pat: Patterns, - k: usize, // maximum edit distance to accept - inexact: bool, // whether to only report inexact matches - offset: usize, // Left-offset relevant for range matching + /// Encoded patterns for the first pattern collection + pat1: Option>, + /// Encoded patterns for the second pattern collection + pat2: Option>, + /// Encoded patterns for the shared pattern collection + pat: Option>, + + /// Maximum edit distance to accept + k: usize, + /// Whether to only report inexact matches + inexact: bool, + /// Left-offset relevant for range matching + offset: usize, + + /// Fixed-bitset for pat1 + bs1: FixedBitSet, + /// Fixed-bitset for pat2 + bs2: FixedBitSet, + /// Fixed-bitset for pat + bs: FixedBitSet, + + /// Primary sequence pattern searcher + searcher_1: Searcher, + /// Secondary sequence pattern searcher + searcher_2: Searcher, + /// Shared sequence pattern searcher searcher: Searcher, } @@ -23,36 +47,66 @@ impl FuzzyMatcher { k: usize, inexact: bool, offset: usize, - ) -> Self { - Self { - pat1, - pat2, - pat, + ) -> Result { + // initialize a searcher for each pattern collection + let mut searcher_1 = Searcher::new_fwd(); + let mut searcher_2 = Searcher::new_fwd(); + let mut searcher = Searcher::new_fwd(); + + // validate pattern lengths + validate_single_pattern_length(&pat1)?; + validate_single_pattern_length(&pat2)?; + validate_single_pattern_length(&pat)?; + + // encode the patterns for each collection/searcher combination + let enc_pat1 = (!pat1.is_empty()).then(|| searcher_1.encode_patterns(&pat1)); + let enc_pat2 = (!pat2.is_empty()).then(|| searcher_2.encode_patterns(&pat2)); + let enc_pat = (!pat.is_empty()).then(|| searcher.encode_patterns(&pat)); + + // initialize fixed-bitsets for each pattern collection + let bs1 = FixedBitSet::with_capacity(pat1.len()); + let bs2 = FixedBitSet::with_capacity(pat2.len()); + let bs = FixedBitSet::with_capacity(pat.len()); + + Ok(Self { + pat1: enc_pat1, + pat2: enc_pat2, + pat: enc_pat, k, inexact, offset, - searcher: Searcher::new_fwd(), - } + bs1, + bs2, + bs, + searcher_1, + searcher_2, + searcher, + }) } } fn find_and_insert_matches( - pat: &[u8], + patterns: &EncodedPatterns, sequence: &[u8], matches: &mut MatchRanges, searcher: &mut Searcher, + bitset: &mut FixedBitSet, k: usize, inexact: bool, offset: usize, ) -> bool { let mut found = false; - for mat in searcher.search(pat, &sequence, k) { - if inexact && mat.cost == 0 { - continue; - } - matches.insert((mat.text_start + offset, mat.text_end + offset)); - found = true; - } + searcher + .search_encoded_patterns(patterns, sequence, k) + .iter() + .for_each(|m| { + if inexact && m.cost == 0 { + return; + } + matches.insert((m.text_start + offset, m.text_end + offset)); + bitset.set(m.pattern_idx, true); + found = true; + }); found } @@ -67,25 +121,26 @@ impl PatternMatch for FuzzyMatcher { matches: &mut MatchRanges, and_logic: bool, ) -> bool { - if self.pat1.is_empty() { - return true; - } - let offset = self.offset(); - let closure = |pat: &Vec| { - find_and_insert_matches( - pat, + if let Some(ref epat) = self.pat1 { + self.bs1.clear(); + let offset = self.offset(); + let has_any_match = find_and_insert_matches( + epat, sequence, matches, - &mut self.searcher, + &mut self.searcher_1, + &mut self.bs1, self.k, self.inexact, offset, - ) - }; - if and_logic { - self.pat1.iter().all(closure) + ); + if and_logic { + has_any_match && self.bs1.is_full() + } else { + has_any_match + } } else { - self.pat1.iter().any(closure) + true } } @@ -95,25 +150,26 @@ impl PatternMatch for FuzzyMatcher { matches: &mut MatchRanges, and_logic: bool, ) -> bool { - if self.pat2.is_empty() || sequence.is_empty() { - return true; - } - let offset = self.offset(); - let closure = |pat: &Vec| { - find_and_insert_matches( - pat, + if let Some(ref epat) = self.pat2 { + self.bs2.clear(); + let offset = self.offset(); + let has_any_match = find_and_insert_matches( + epat, sequence, matches, - &mut self.searcher, + &mut self.searcher_2, + &mut self.bs2, self.k, self.inexact, offset, - ) - }; - if and_logic { - self.pat2.iter().all(closure) + ); + if and_logic { + has_any_match && self.bs2.is_full() + } else { + has_any_match + } } else { - self.pat2.iter().any(closure) + true } } @@ -125,35 +181,52 @@ impl PatternMatch for FuzzyMatcher { xmatches: &mut MatchRanges, and_logic: bool, ) -> bool { - if self.pat.is_empty() { - return true; - } - let offset = self.offset(); - let closure = |pat: &Vec| { - let found_s = find_and_insert_matches( - pat, + if let Some(ref epat) = self.pat { + self.bs.clear(); + let offset = self.offset(); + let primary_has_any_match = find_and_insert_matches( + epat, primary, smatches, &mut self.searcher, + &mut self.bs, self.k, self.inexact, offset, ); - let found_x = find_and_insert_matches( - pat, + let secondary_has_any_match = find_and_insert_matches( + epat, secondary, xmatches, &mut self.searcher, + &mut self.bs, self.k, self.inexact, offset, ); - found_s || found_x // OR here because we want to match either primary or secondary - }; - if and_logic { - self.pat.iter().all(closure) + let has_any_match = primary_has_any_match || secondary_has_any_match; + if and_logic { + has_any_match && self.bs.is_full() + } else { + has_any_match + } } else { - self.pat.iter().any(closure) + true } } } + +fn validate_single_pattern_length(patterns: &Patterns) -> Result<()> { + if patterns.len() < 2 { + return Ok(()); + } + let plen = patterns[0].len(); + for pattern in patterns { + if pattern.len() != plen { + error!("Multiple pattern lengths provided - currently cannot handle variable-length patterns in fuzzy matching"); + return Err(anyhow::anyhow!("Pattern length mismatch")); + } + } + + Ok(()) +} diff --git a/src/commands/grep/filter/mod.rs b/src/commands/grep/filter/mod.rs index b4cbcf8..6cace78 100644 --- a/src/commands/grep/filter/mod.rs +++ b/src/commands/grep/filter/mod.rs @@ -332,7 +332,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_basic() { let pat1 = vec![b"AAAAAAAA".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0).unwrap(); // Exact match let seq_exact = b"GGGGAAAAAAAATTTT"; @@ -357,7 +357,7 @@ mod matcher_unit_tests { let pat1 = vec![b"AAAAAAAA".to_vec()]; // Test with k=0 (exact match only) - let mut matcher_k0 = FuzzyMatcher::new(pat1.clone(), vec![], vec![], 0, false, 0); + let mut matcher_k0 = FuzzyMatcher::new(pat1.clone(), vec![], vec![], 0, false, 0).unwrap(); let seq_exact = b"GGGGAAAAAAAATTTT"; let seq_mismatch = b"GGGGAAAAACAATTTT"; let mut matches1 = HashSet::new(); @@ -367,7 +367,7 @@ mod matcher_unit_tests { assert!(!matcher_k0.match_primary(seq_mismatch, &mut matches2, true)); // Test with k=2 (up to 2 edits) - let mut matcher_k2 = FuzzyMatcher::new(pat1, vec![], vec![], 2, false, 0); + let mut matcher_k2 = FuzzyMatcher::new(pat1, vec![], vec![], 2, false, 0).unwrap(); let mut matches3 = HashSet::new(); assert!(matcher_k2.match_primary(seq_mismatch, &mut matches3, true)); @@ -377,7 +377,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_inexact_only() { let pat1 = vec![b"AAAAAAAA".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 2, true, 0); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 2, true, 0).unwrap(); // Exact match should not be reported with inexact_only let seq_exact = b"GGGGAAAAAAAATTTT"; @@ -394,7 +394,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_secondary() { let pat2 = vec![b"TTTTTTTT".to_vec()]; - let mut matcher = FuzzyMatcher::new(vec![], pat2, vec![], 1, false, 0); + let mut matcher = FuzzyMatcher::new(vec![], pat2, vec![], 1, false, 0).unwrap(); let sequence = b"GGGGTTTTTTTTCCCC"; let mut matches = HashSet::new(); @@ -406,7 +406,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_either() { let pat = vec![b"CCCCCCCC".to_vec()]; - let mut matcher = FuzzyMatcher::new(vec![], vec![], pat, 1, false, 0); + let mut matcher = FuzzyMatcher::new(vec![], vec![], pat, 1, false, 0).unwrap(); let primary = b"GGGGAAAATTTT"; let secondary = b"GGGGCCCCCCCCTTTT"; @@ -424,7 +424,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_and_logic() { let pat1 = vec![b"AAAAAAAA".to_vec(), b"TTTTTTTT".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0).unwrap(); // Sequence with both patterns let seq_both = b"AAAAAAAATTTTTTTT"; @@ -441,7 +441,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_or_logic() { let pat1 = vec![b"AAAAAAAA".to_vec(), b"TTTTTTTT".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, 0).unwrap(); // Sequence with only one pattern let seq = b"AAAAAAAACCCCCCCC"; @@ -452,7 +452,7 @@ mod matcher_unit_tests { #[cfg(feature = "fuzzy")] #[test] fn test_fuzzy_matcher_empty_patterns() { - let mut matcher = FuzzyMatcher::new(vec![], vec![], vec![], 1, false, 0); + let mut matcher = FuzzyMatcher::new(vec![], vec![], vec![], 1, false, 0).unwrap(); let sequence = b"GGGGAAAATTTT"; let mut matches = HashSet::new(); @@ -617,7 +617,7 @@ mod matcher_unit_tests { #[test] fn test_fuzzy_matcher_offset_zero() { let pat1 = vec![b"AAAA".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 0, false, 0); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 0, false, 0).unwrap(); let sequence = b"GGGGAAAAATTTT"; let mut matches = HashSet::new(); @@ -636,7 +636,8 @@ mod matcher_unit_tests { fn test_fuzzy_matcher_offset_nonzero() { let offset = 15; let pat1 = vec![b"AAAA".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1.clone(), vec![], vec![], 1, false, offset); + let mut matcher = + FuzzyMatcher::new(pat1.clone(), vec![], vec![], 1, false, offset).unwrap(); let sequence = b"GGGGAAAAATTTT"; let mut matches = HashSet::new(); @@ -647,7 +648,8 @@ mod matcher_unit_tests { assert!(!matches.is_empty(), "Should find at least one match"); // Create a matcher with offset=0 to get the baseline positions - let mut baseline_matcher = FuzzyMatcher::new(pat1.clone(), vec![], vec![], 1, false, 0); + let mut baseline_matcher = + FuzzyMatcher::new(pat1.clone(), vec![], vec![], 1, false, 0).unwrap(); let mut baseline_matches = HashSet::new(); baseline_matcher.match_primary(sequence, &mut baseline_matches, true); let baseline_match = baseline_matches.iter().next().unwrap(); @@ -681,7 +683,7 @@ mod matcher_unit_tests { fn test_fuzzy_matcher_offset_with_mismatch() { let offset = 8; let pat1 = vec![b"AAAA".to_vec()]; - let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, offset); + let mut matcher = FuzzyMatcher::new(pat1, vec![], vec![], 1, false, offset).unwrap(); // One mismatch in the pattern let sequence = b"GGGGAACAATTTT"; @@ -704,7 +706,7 @@ mod matcher_unit_tests { fn test_fuzzy_matcher_offset_secondary() { let offset = 12; let pat2 = vec![b"TTTT".to_vec()]; - let mut matcher = FuzzyMatcher::new(vec![], pat2, vec![], 1, false, offset); + let mut matcher = FuzzyMatcher::new(vec![], pat2, vec![], 1, false, offset).unwrap(); let sequence = b"GGGGTTTTCCCC"; let mut matches = HashSet::new(); diff --git a/src/commands/grep/mod.rs b/src/commands/grep/mod.rs index 6098a22..83c4a9f 100644 --- a/src/commands/grep/mod.rs +++ b/src/commands/grep/mod.rs @@ -6,7 +6,7 @@ mod range; #[cfg(feature = "fuzzy")] use filter::FuzzyMatcher; -use log::warn; +use log::{error, warn}; #[cfg(feature = "fuzzy")] use pattern_count::FuzzyPatternCounter; @@ -24,7 +24,7 @@ use crate::{ commands::{decode::SplitWriter, grep::filter::AhoCorasickMatcher}, }; -use anyhow::Result; +use anyhow::{bail, Result}; use binseq::prelude::*; /// Returns true if the pattern is a fixed DNA string (only ACGT). @@ -43,16 +43,66 @@ fn all_patterns_fixed(pattern_sets: &[&PatternCollection]) -> bool { .all(|p| is_fixed(&p.sequence)) } +/// Handles pattern mates by clearing and ingesting patterns into the appropriate collections. +/// +/// This is relevant when using the `--mate` option, and enforces that no matches can occur on an ignored mate. +fn redistribute_patterns( + pat1: &mut PatternCollection, + pat2: &mut PatternCollection, + pat: &mut PatternCollection, + mate: Mate, +) -> Result<()> { + match mate { + Mate::Both => { + // Do nothing - both mates are used + } + Mate::One => { + pat2.clear(); // remove existing patterns from mate 2 + pat1.ingest(pat); // take patterns from both and apply only to mate 1 + if pat1.is_empty() { + error!("No patterns provided for mate 1"); + bail!("No patterns provided for mate 1"); + } + } + Mate::Two => { + pat1.clear(); // remove existing patterns from mate 1 + pat2.ingest(pat); // take patterns from both and apply only to mate 2 + if pat2.is_empty() { + error!("No patterns provided for mate 2"); + bail!("No patterns provided for mate 2"); + } + } + } + Ok(()) +} + +fn load_patterns(args: &GrepCommand) -> Result { + let mut pat1 = args.grep.patterns_m1()?; + let mut pat2 = args.grep.patterns_m2()?; + let mut pat = args.grep.patterns()?; + redistribute_patterns(&mut pat1, &mut pat2, &mut pat, args.output.mate)?; + Ok(AllPatterns { pat1, pat2, pat }) +} + +struct AllPatterns { + pat1: PatternCollection, + pat2: PatternCollection, + pat: PatternCollection, +} +impl AllPatterns { + pub fn are_fixed(&self) -> bool { + all_patterns_fixed(&[&self.pat1, &self.pat2, &self.pat]) + } +} + fn build_counter(args: &GrepCommand) -> Result { #[cfg(feature = "fuzzy")] if args.grep.fuzzy_args.fuzzy { - let pat1 = args.grep.patterns_m1()?; - let pat2 = args.grep.patterns_m2()?; - let pat = args.grep.patterns()?; + let patterns = load_patterns(args)?; let counter = FuzzyPatternCounter::new( - pat1, - pat2, - pat, + patterns.pat1, + patterns.pat2, + patterns.pat, args.grep.fuzzy_args.distance, args.grep.fuzzy_args.inexact, args.grep.invert, @@ -60,20 +110,24 @@ fn build_counter(args: &GrepCommand) -> Result { return Ok(PatternCounter::Fuzzy(counter)); } - let pat1 = args.grep.patterns_m1()?; - let pat2 = args.grep.patterns_m2()?; - let pat = args.grep.patterns()?; - let use_fixed = args.grep.fixed || all_patterns_fixed(&[&pat1, &pat2, &pat]); + let patterns = load_patterns(args)?; + let use_fixed = args.grep.fixed || patterns.are_fixed(); if !args.grep.fixed && use_fixed { log::debug!("All patterns are fixed strings — auto-selecting Aho-Corasick"); } if use_fixed { - let counter = - AhoCorasickPatternCounter::new(pat1, pat2, pat, args.grep.no_dfa, args.grep.invert)?; + let counter = AhoCorasickPatternCounter::new( + patterns.pat1, + patterns.pat2, + patterns.pat, + args.grep.no_dfa, + args.grep.invert, + )?; Ok(PatternCounter::AhoCorasick(counter)) } else { - let counter = RegexPatternCounter::new(pat1, pat2, pat, args.grep.invert)?; + let counter = + RegexPatternCounter::new(patterns.pat1, patterns.pat2, patterns.pat, args.grep.invert)?; Ok(PatternCounter::Regex(counter)) } } @@ -99,33 +153,29 @@ fn run_pattern_count(args: &GrepCommand, reader: BinseqReader) -> Result<()> { fn build_matcher(args: &GrepCommand) -> Result { #[cfg(feature = "fuzzy")] if args.grep.fuzzy_args.fuzzy { - let pat1 = args.grep.patterns_m1()?; - let pat2 = args.grep.patterns_m2()?; - let pat = args.grep.patterns()?; + let patterns = load_patterns(args)?; let matcher = FuzzyMatcher::new( - pat1.bytes(), - pat2.bytes(), - pat.bytes(), + patterns.pat1.bytes(), + patterns.pat2.bytes(), + patterns.pat.bytes(), args.grep.fuzzy_args.distance, args.grep.fuzzy_args.inexact, args.grep.range.map_or(0, |r| r.offset()), - ); + )?; return Ok(PatternMatcher::Fuzzy(matcher)); } - let pat1 = args.grep.patterns_m1()?; - let pat2 = args.grep.patterns_m2()?; - let pat = args.grep.patterns()?; - let use_fixed = args.grep.fixed || all_patterns_fixed(&[&pat1, &pat2, &pat]); + let patterns = load_patterns(args)?; + let use_fixed = args.grep.fixed || patterns.are_fixed(); if !args.grep.fixed && use_fixed { log::debug!("All patterns are fixed strings — auto-selecting Aho-Corasick"); } if use_fixed && !args.grep.and_logic() { let matcher = AhoCorasickMatcher::new( - pat1.bytes(), - pat2.bytes(), - pat.bytes(), + patterns.pat1.bytes(), + patterns.pat2.bytes(), + patterns.pat.bytes(), args.grep.no_dfa, args.grep.range.map_or(0, |r| r.offset()), )?; @@ -135,9 +185,9 @@ fn build_matcher(args: &GrepCommand) -> Result { warn!("`-x/--fixed provided but ignored when using AND logic"); } let matcher = RegexMatcher::new( - pat1.regexes()?, - pat2.regexes()?, - pat.regexes()?, + patterns.pat1.regexes()?, + patterns.pat2.regexes()?, + patterns.pat.regexes()?, args.grep.range.map_or(0, |r| r.offset()), ); Ok(PatternMatcher::Regex(matcher)) @@ -203,7 +253,9 @@ pub fn run(args: &GrepCommand) -> Result<()> { #[cfg(test)] mod fixed_detection_tests { - use super::{all_patterns_fixed, is_fixed, Pattern, PatternCollection}; + use crate::commands::grep::redistribute_patterns; + + use super::{all_patterns_fixed, is_fixed, Mate, Pattern, PatternCollection}; fn pc(patterns: &[&[u8]]) -> PatternCollection { PatternCollection( @@ -271,4 +323,47 @@ mod fixed_detection_tests { let p2 = pc(&[]); assert!(all_patterns_fixed(&[&p1, &p2])); } + + #[test] + fn test_redistribution_noop() { + let mut pat1 = pc(&[b"ACGT", b"TTTT"]); + let mut pat2 = pc(&[b"GGGG"]); + let mut pat = pc(&[b"AC.GT"]); + + let pat1_clone = pat1.clone(); + let pat2_clone = pat2.clone(); + let pat_clone = pat.clone(); + + redistribute_patterns(&mut pat1, &mut pat2, &mut pat, Mate::Both).unwrap(); + + assert_eq!(pat1, pat1_clone); + assert_eq!(pat2, pat2_clone); + assert_eq!(pat, pat_clone); + } + + #[test] + fn test_redistribution_m1() { + let mut pat1 = pc(&[b"ACGT", b"TTTT"]); + let mut pat2 = pc(&[b"GGGG"]); + let mut pat = pc(&[b"AC.GT"]); + + redistribute_patterns(&mut pat1, &mut pat2, &mut pat, Mate::One).unwrap(); + + assert_eq!(pat1, pc(&[b"ACGT", b"TTTT", b"AC.GT"])); + assert!(pat2.is_empty()); + assert!(pat.is_empty()); + } + + #[test] + fn test_redistribution_m2() { + let mut pat1 = pc(&[b"ACGT", b"TTTT"]); + let mut pat2 = pc(&[b"GGGG"]); + let mut pat = pc(&[b"AC.GT"]); + + redistribute_patterns(&mut pat1, &mut pat2, &mut pat, Mate::Two).unwrap(); + + assert!(pat1.is_empty()); + assert_eq!(pat2, pc(&[b"GGGG", b"AC.GT"])); + assert!(pat.is_empty()); + } } diff --git a/src/commands/grep/pattern_count/fuzzy_pc.rs b/src/commands/grep/pattern_count/fuzzy_pc.rs index 83a4821..6b9db7d 100644 --- a/src/commands/grep/pattern_count/fuzzy_pc.rs +++ b/src/commands/grep/pattern_count/fuzzy_pc.rs @@ -1,20 +1,35 @@ use super::{PatternCollection, PatternCount}; -use sassy::{profiles::Iupac, Searcher}; +use fixedbitset::FixedBitSet; +use sassy::{profiles::Iupac, EncodedPatterns, Match, Searcher}; + +type Profile = Iupac; -type Patterns = Vec>; #[derive(Clone)] pub struct FuzzyPatternCounter { /// Patterns to fuzzy match on - pat1: Patterns, // in primary - pat2: Patterns, // in secondary - pat: Patterns, // in either - k: usize, // maximum edit distance to accept - inexact: bool, // whether to only report inexact matches - invert: bool, // invert the match + pat1: Option>, // in primary + pat2: Option>, // in secondary + pat: Option>, // in either + k: usize, // maximum edit distance to accept + inexact: bool, // whether to only report inexact matches + invert: bool, // invert the match + + /// Fixed bitset for pat1 + bits1: FixedBitSet, + /// Fixed bitset for pat2 + bits2: FixedBitSet, + /// Fixed bitset for pat + bits: FixedBitSet, all_patterns: PatternCollection, - searcher: Searcher, + + /// Primary sequence searcher + searcher_1: Searcher, + /// Extended sequence searcher + searcher_2: Searcher, + /// Combined searcher + searcher: Searcher, } impl FuzzyPatternCounter { @@ -26,82 +41,135 @@ impl FuzzyPatternCounter { inexact: bool, invert: bool, ) -> Self { - let pat1_bytes = pat1.bytes(); - let pat2_bytes = pat2.bytes(); - let pat_bytes = pat.bytes(); + // initialize a searcher for each pattern collection + let mut searcher_1 = Searcher::new_fwd(); + let mut searcher_2 = Searcher::new_fwd(); + let mut searcher = Searcher::new_fwd(); + + // encode the patterns for each collection/searcher combination + let enc_pat1 = (!pat1.is_empty()).then(|| searcher_1.encode_patterns(&pat1.bytes())); + let enc_pat2 = (!pat2.is_empty()).then(|| searcher_2.encode_patterns(&pat2.bytes())); + let enc_pat = (!pat.is_empty()).then(|| searcher.encode_patterns(&pat.bytes())); + + let bits1 = FixedBitSet::with_capacity(pat1.len()); + let bits2 = FixedBitSet::with_capacity(pat2.len()); + let bits = FixedBitSet::with_capacity(pat.len()); + + // combine all patterns into a single collection for reporting let all_patterns = PatternCollection(pat1.into_iter().chain(pat2).chain(pat).collect()); + Self { - pat1: pat1_bytes, - pat2: pat2_bytes, - pat: pat_bytes, + pat1: enc_pat1, + pat2: enc_pat2, + pat: enc_pat, k, inexact, invert, all_patterns, - searcher: Searcher::new_fwd(), + bits1, + bits2, + bits, + searcher_1, + searcher_2, + searcher, } } - fn match_primary(&mut self, sequence: &[u8], pattern_counts: &mut [usize]) { - if self.pat1.is_empty() { - return; - } - self.pat1.iter().enumerate().for_each(|(index, pat)| { - let counted = self - .searcher - .search(pat, &sequence, self.k) + fn match_primary(&mut self, sequence: &[u8]) { + if let Some(ref epat) = self.pat1 { + self.searcher_1 + .search_encoded_patterns(epat, sequence, self.k) .iter() - .any(|mat| !self.inexact || mat.cost != 0); - if counted != self.invert { - pattern_counts[index] += 1; - } - }); + .for_each(|m| { + let counted = !self.inexact || m.cost != 0; + if counted { + self.bits1.set(m.pattern_idx, true); + } + }); + } } - fn match_secondary(&mut self, sequence: &[u8], pattern_counts: &mut [usize]) { - if self.pat2.is_empty() || sequence.is_empty() { - return; - } - self.pat2.iter().enumerate().for_each(|(index, pat)| { - let counted = self - .searcher - .search(pat, &sequence, self.k) + fn match_secondary(&mut self, sequence: &[u8]) { + if let Some(ref epat) = self.pat2 { + self.searcher_2 + .search_encoded_patterns(epat, sequence, self.k) .iter() - .any(|mat| !self.inexact || mat.cost != 0); - if counted != self.invert { - pattern_counts[self.pat1.len() + index] += 1; - } - }); + .for_each(|m| { + let counted = !self.inexact || m.cost != 0; + if counted { + self.bits2.set(m.pattern_idx, true); + } + }); + } } - fn match_either(&mut self, primary: &[u8], secondary: &[u8], pattern_counts: &mut [usize]) { - if self.pat.is_empty() { - return; - } - self.pat.iter().enumerate().for_each(|(index, pat)| { - let counted = self - .searcher - .search(pat, &primary, self.k) + fn match_either(&mut self, primary: &[u8], secondary: &[u8]) { + if let Some(ref epat) = self.pat { + let mut eval = |m: &Match| { + let counted = !self.inexact || m.cost != 0; + if counted { + self.bits.set(m.pattern_idx, true); + } + }; + + // match on primary + self.searcher + .search_encoded_patterns(epat, primary, self.k) .iter() - .chain(self.searcher.search(pat, &secondary, self.k).iter()) - .any(|mat| !self.inexact || mat.cost != 0); + .for_each(|m| eval(m)); + + // match on secondary + self.searcher + .search_encoded_patterns(epat, secondary, self.k) + .iter() + .for_each(|m| eval(m)); + } + } + + fn clear_bits(&mut self) { + self.bits1.clear(); + self.bits2.clear(); + self.bits.clear(); + } - if counted != self.invert { - pattern_counts[self.pat1.len() + self.pat2.len() + index] += 1; + fn update_pattern_count(&mut self, pattern_count: &mut [usize]) { + // evaluate the bitset for each pattern type + let mut eval = |bitset: &FixedBitSet, invert: bool, offset: usize| { + if invert { + bitset.zeroes().for_each(|i| { + pattern_count[i as usize + offset] += 1; + }); + } else { + bitset.ones().for_each(|i| { + pattern_count[i as usize + offset] += 1; + }); } - }); + }; + + eval(&self.bits1, self.invert, 0); + eval(&self.bits2, self.invert, self.bits1.len()); + eval(&self.bits, self.invert, self.bits1.len() + self.bits2.len()); } } impl PatternCount for FuzzyPatternCounter { fn count_patterns(&mut self, primary: &[u8], secondary: &[u8], pattern_count: &mut [usize]) { - self.match_primary(primary, pattern_count); - self.match_secondary(secondary, pattern_count); - self.match_either(primary, secondary, pattern_count); + // remove all previous matches + self.clear_bits(); + + // match patterns + self.match_primary(primary); + self.match_secondary(secondary); + self.match_either(primary, secondary); + + // update pattern count (invert if necessary) + self.update_pattern_count(pattern_count); } fn num_patterns(&self) -> usize { - self.pat1.len() + self.pat2.len() + self.pat.len() + self.pat1.as_ref().map_or(0, |p| p.n_queries()) + + self.pat2.as_ref().map_or(0, |p| p.n_queries()) + + self.pat.as_ref().map_or(0, |p| p.n_queries()) } fn pattern_strings(&self) -> Vec { diff --git a/src/commands/grep/patterns.rs b/src/commands/grep/patterns.rs index ed41845..f9c10e3 100644 --- a/src/commands/grep/patterns.rs +++ b/src/commands/grep/patterns.rs @@ -1,7 +1,7 @@ use anyhow::Result; /// A pattern with an optional name (from FASTA headers). -#[derive(Clone)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct Pattern { pub name: Option, pub sequence: Vec, @@ -15,7 +15,7 @@ impl Pattern { } /// A collection of patterns with convenience methods for type conversions. -#[derive(Clone)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct PatternCollection(pub Vec); impl PatternCollection { pub fn bytes(&self) -> Vec> { @@ -43,7 +43,6 @@ impl PatternCollection { self.0.len() } - #[allow(dead_code)] pub fn is_empty(&self) -> bool { self.0.is_empty() } @@ -51,6 +50,21 @@ impl PatternCollection { pub fn iter(&self) -> std::slice::Iter<'_, Pattern> { self.0.iter() } + + /// Takes all patterns from `other` and moves them into this collection. + pub fn ingest(&mut self, other: &mut Self) { + self.0.extend(other.drain()); + } + + /// Drains all patterns from this collection, returning an iterator over them. + pub fn drain(&mut self) -> impl Iterator + '_ { + self.0.drain(..) + } + + /// Clears all patterns from this collection. + pub fn clear(&mut self) { + self.0.clear(); + } } impl IntoIterator for PatternCollection { type Item = Pattern; diff --git a/src/commands/sample/mod.rs b/src/commands/sample/mod.rs index 2d20bb8..362b669 100644 --- a/src/commands/sample/mod.rs +++ b/src/commands/sample/mod.rs @@ -4,7 +4,7 @@ use crate::cli::{FileFormat, Mate, SampleCommand}; use anyhow::Result; use binseq::prelude::*; use parking_lot::Mutex; -use rand::{Rng, SeedableRng}; +use rand::{RngExt, SeedableRng}; use super::decode::{build_writer, write_record_pair, SplitWriter}; diff --git a/tests/common.rs b/tests/common.rs index c83524e..78062e8 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -5,6 +5,7 @@ use anyhow::Result; use binseq::BinseqReader; use bon::builder; use niffler::Level; +use rand::{Rng, RngExt}; use tempfile::NamedTempFile; pub const COMMAND_NAME: &str = "bqtools"; @@ -104,6 +105,21 @@ fn with_suffix(format: FastxFormat, comp: CompressionStatus) -> String { format!("{}{}", format.suffix(), comp.suffix()) } +fn fill_nucleotide_buffer(buffer: &mut Vec, rng: &mut R, slen: usize) { + buffer.clear(); + buffer.reserve(slen); + for _ in 0..slen { + buffer.push(match rng.random_range(0..=4) { + 0 => b'A', + 1 => b'C', + 2 => b'G', + 3 => b'T', + 4 => b'N', + _ => unreachable!(), + }); + } +} + #[builder] pub fn write_fastx( #[builder(default)] comp: CompressionStatus, @@ -113,16 +129,14 @@ pub fn write_fastx( ) -> Result { let tempfile = NamedTempFile::with_suffix(with_suffix(format, comp))?; let mut handle = compression_passthrough(tempfile.path(), comp)?; - let mut seqgen = nucgen::Sequence::with_capacity(slen); + let mut seq = Vec::with_capacity(slen); let mut rng = rand::rng(); let qual = vec![b'?'; slen]; for idx in 0..nrec { - seqgen.clear_buffer(); - seqgen.fill_buffer(&mut rng, slen); - let seq = seqgen.bytes(); + fill_nucleotide_buffer(&mut seq, &mut rng, slen); match format { - FastxFormat::Fastq => write_fastq_to(&mut handle, idx, seq, &qual)?, - FastxFormat::Fasta => write_fasta_to(&mut handle, idx, seq)?, + FastxFormat::Fastq => write_fastq_to(&mut handle, idx, &seq, &qual)?, + FastxFormat::Fasta => write_fasta_to(&mut handle, idx, &seq)?, } } handle.flush()?;