Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bqtools"
version = "0.5.4"
version = "0.5.5"
edition = "2021"
license = "MIT"
authors = ["Noam Teyssier <noam.teyssier@arcinstitute.org>"]
Expand All @@ -27,17 +27,16 @@ nix = { version = "0.31.1", features = ["fs"] }
num_cpus = "1.17.0"
paraseq = "0.4.8"
parking_lot = "0.12.5"
rand = "0.9.2"
rand = "0.10.0"
regex = "1.12.2"
sassy = { version = "0.1.10", optional = true }
sassy = { version = "0.2.1", optional = true }
serde = { version = "1.0.228", features = ["derive"] }
walkdir = "2.5.0"
zstd = { version = "0.13.3", features = ["zstdmt"] }

[dev-dependencies]
bon = "3.8.2"
itertools = "0.14.0"
nucgen = "0.2.0"
tempfile = "3.24.0"

[features]
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,13 @@ It will then balance the provided file/file pairs among the thread pool to ensur
All options provided by `bqtools encode` will be passed through to the sub-encoders.

```bash
# Encode all FASTX files as BQ
bqtools encode --recursive --mode bq ./
# Encode all FASTX files as CBQ
bqtools encode --recursive --mode cbq ./

# Encode all paired FASTX files as VBQ and index their output
bqtools encode --recursive --paired --mode vbq --index ./
# Encode all paired FASTX files as VBQ
bqtools encode --recursive --paired --mode vbq ./

# Encode recursively with a max-subdirectory depth of 2
# Encode as BQ recursively with a max-subdirectory depth of 2
bqtools encode --recursive --mode bq --depth 2 ./
```

Expand Down
197 changes: 135 additions & 62 deletions src/commands/grep/filter/fuzzy_matcher.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,41 @@
use super::{MatchRanges, PatternMatch};

use sassy::{profiles::Iupac, Searcher};
use anyhow::Result;
use fixedbitset::FixedBitSet;
use log::error;
use sassy::{profiles::Iupac, EncodedPatterns, Searcher};

type Profile = Iupac;
type Patterns = Vec<Vec<u8>>;

#[derive(Clone)]
pub struct FuzzyMatcher {
pat1: Patterns,
pat2: Patterns,
pat: Patterns,
k: usize, // maximum edit distance to accept
inexact: bool, // whether to only report inexact matches
offset: usize, // Left-offset relevant for range matching
/// Encoded patterns for the first pattern collection
pat1: Option<EncodedPatterns<Profile>>,
/// Encoded patterns for the second pattern collection
pat2: Option<EncodedPatterns<Profile>>,
/// Encoded patterns for the shared pattern collection
pat: Option<EncodedPatterns<Profile>>,

/// Maximum edit distance to accept
k: usize,
/// Whether to only report inexact matches
inexact: bool,
/// Left-offset relevant for range matching
offset: usize,

/// Fixed-bitset for pat1
bs1: FixedBitSet,
/// Fixed-bitset for pat2
bs2: FixedBitSet,
/// Fixed-bitset for pat
bs: FixedBitSet,

/// Primary sequence pattern searcher
searcher_1: Searcher<Iupac>,
/// Secondary sequence pattern searcher
searcher_2: Searcher<Iupac>,
/// Shared sequence pattern searcher
searcher: Searcher<Iupac>,
}

Expand All @@ -23,36 +47,66 @@ impl FuzzyMatcher {
k: usize,
inexact: bool,
offset: usize,
) -> Self {
Self {
pat1,
pat2,
pat,
) -> Result<Self> {
// initialize a searcher for each pattern collection
let mut searcher_1 = Searcher::new_fwd();
let mut searcher_2 = Searcher::new_fwd();
let mut searcher = Searcher::new_fwd();

// validate pattern lengths
validate_single_pattern_length(&pat1)?;
validate_single_pattern_length(&pat2)?;
validate_single_pattern_length(&pat)?;

// encode the patterns for each collection/searcher combination
let enc_pat1 = (!pat1.is_empty()).then(|| searcher_1.encode_patterns(&pat1));
let enc_pat2 = (!pat2.is_empty()).then(|| searcher_2.encode_patterns(&pat2));
let enc_pat = (!pat.is_empty()).then(|| searcher.encode_patterns(&pat));

// initialize fixed-bitsets for each pattern collection
let bs1 = FixedBitSet::with_capacity(pat1.len());
let bs2 = FixedBitSet::with_capacity(pat2.len());
let bs = FixedBitSet::with_capacity(pat.len());

Ok(Self {
pat1: enc_pat1,
pat2: enc_pat2,
pat: enc_pat,
k,
inexact,
offset,
searcher: Searcher::new_fwd(),
}
bs1,
bs2,
bs,
searcher_1,
searcher_2,
searcher,
})
}
}

fn find_and_insert_matches(
pat: &[u8],
patterns: &EncodedPatterns<Profile>,
sequence: &[u8],
matches: &mut MatchRanges,
searcher: &mut Searcher<Iupac>,
bitset: &mut FixedBitSet,
k: usize,
inexact: bool,
offset: usize,
) -> bool {
let mut found = false;
for mat in searcher.search(pat, &sequence, k) {
if inexact && mat.cost == 0 {
continue;
}
matches.insert((mat.text_start + offset, mat.text_end + offset));
found = true;
}
searcher
.search_encoded_patterns(patterns, sequence, k)
.iter()
.for_each(|m| {
if inexact && m.cost == 0 {
return;
}
matches.insert((m.text_start + offset, m.text_end + offset));
bitset.set(m.pattern_idx, true);
found = true;
});
found
}

Expand All @@ -67,25 +121,26 @@ impl PatternMatch for FuzzyMatcher {
matches: &mut MatchRanges,
and_logic: bool,
) -> bool {
if self.pat1.is_empty() {
return true;
}
let offset = self.offset();
let closure = |pat: &Vec<u8>| {
find_and_insert_matches(
pat,
if let Some(ref epat) = self.pat1 {
self.bs1.clear();
let offset = self.offset();
let has_any_match = find_and_insert_matches(
epat,
sequence,
matches,
&mut self.searcher,
&mut self.searcher_1,
&mut self.bs1,
self.k,
self.inexact,
offset,
)
};
if and_logic {
self.pat1.iter().all(closure)
);
if and_logic {
has_any_match && self.bs1.is_full()
} else {
has_any_match
}
} else {
self.pat1.iter().any(closure)
true
}
}

Expand All @@ -95,25 +150,26 @@ impl PatternMatch for FuzzyMatcher {
matches: &mut MatchRanges,
and_logic: bool,
) -> bool {
if self.pat2.is_empty() || sequence.is_empty() {
return true;
}
let offset = self.offset();
let closure = |pat: &Vec<u8>| {
find_and_insert_matches(
pat,
if let Some(ref epat) = self.pat2 {
self.bs2.clear();
let offset = self.offset();
let has_any_match = find_and_insert_matches(
epat,
sequence,
matches,
&mut self.searcher,
&mut self.searcher_2,
&mut self.bs2,
self.k,
self.inexact,
offset,
)
};
if and_logic {
self.pat2.iter().all(closure)
);
if and_logic {
has_any_match && self.bs2.is_full()
} else {
has_any_match
}
} else {
self.pat2.iter().any(closure)
true
}
}

Expand All @@ -125,35 +181,52 @@ impl PatternMatch for FuzzyMatcher {
xmatches: &mut MatchRanges,
and_logic: bool,
) -> bool {
if self.pat.is_empty() {
return true;
}
let offset = self.offset();
let closure = |pat: &Vec<u8>| {
let found_s = find_and_insert_matches(
pat,
if let Some(ref epat) = self.pat {
self.bs.clear();
let offset = self.offset();
let primary_has_any_match = find_and_insert_matches(
epat,
primary,
smatches,
&mut self.searcher,
&mut self.bs,
self.k,
self.inexact,
offset,
);
let found_x = find_and_insert_matches(
pat,
let secondary_has_any_match = find_and_insert_matches(
epat,
secondary,
xmatches,
&mut self.searcher,
&mut self.bs,
self.k,
self.inexact,
offset,
);
found_s || found_x // OR here because we want to match either primary or secondary
};
if and_logic {
self.pat.iter().all(closure)
let has_any_match = primary_has_any_match || secondary_has_any_match;
if and_logic {
has_any_match && self.bs.is_full()
} else {
has_any_match
}
} else {
self.pat.iter().any(closure)
true
}
}
}

fn validate_single_pattern_length(patterns: &Patterns) -> Result<()> {
if patterns.len() < 2 {
return Ok(());
}
let plen = patterns[0].len();
for pattern in patterns {
if pattern.len() != plen {
error!("Multiple pattern lengths provided - currently cannot handle variable-length patterns in fuzzy matching");
return Err(anyhow::anyhow!("Pattern length mismatch"));
}
}

Ok(())
}
Loading
Loading