Skip to content

Commit 302a08e

Browse files
authored
Merge pull request #166 from ArcInstitute/autodetermine-fixed-regex
Autodetermine fixed regex
2 parents a62a681 + 3f47c3d commit 302a08e

File tree

4 files changed

+135
-20
lines changed

4 files changed

+135
-20
lines changed

CLAUDE.md

Lines changed: 33 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -52,18 +52,45 @@ Build without defaults: `cargo build --no-default-features -F fuzzy,gcs`
5252

5353
### Core Dependencies
5454

55-
| Crate | Role |
56-
|-------|------|
57-
| `binseq` | BINSEQ format read/write |
58-
| `bitnuc` | 2-bit/4-bit nucleotide encoding |
55+
| Crate | Role |
56+
| --------- | -------------------------------- |
57+
| `binseq` | BINSEQ format read/write |
58+
| `bitnuc` | 2-bit/4-bit nucleotide encoding |
5959
| `paraseq` | Parallel FASTX/BINSEQ processing |
60-
| `clap` | CLI argument parsing (derive) |
61-
| `anyhow` | Error handling throughout |
60+
| `clap` | CLI argument parsing (derive) |
61+
| `anyhow` | Error handling throughout |
6262

6363
### Testing
6464

6565
Integration tests live in `tests/`. `tests/common.rs` provides a builder (`write_fastx()`) for generating random FASTQ/FASTA test data with configurable compression (none, gzip, zstd). Tests use Cartesian products over format/compression/mode combinations. Dev dependencies: `bon` (builder macro), `nucgen` (random sequences), `tempfile`, `itertools`.
6666

67+
### Generating Test Data
68+
69+
Random FASTQ/FASTA test data can be created on the CLI with `nucgen` (`cargo install nucgen` if not already installed).
70+
71+
```bash
72+
# generates 10,000 reads of length 150
73+
nucgen -n 10000 -l 150 some.fq
74+
# generates 30,000 read pairs with read lengths of 50 and 200
75+
nucgen -n 30000 -l 50 -L 200 some_R1.fq some_R2.fq
76+
```
77+
78+
These can then be ingested with `bqtools encode`:
79+
80+
```bash
81+
bqtools encode some.fq -o some.cbq
82+
bqtools encode some_R1.fq some_R2.fq -o some.cbq
83+
```
84+
85+
### Benchmarking Changes
86+
87+
Make use of `hyperfine` (`cargo install hyperfine` if not already installed) to measure performance of binaries after changes.
88+
89+
```bash
90+
# Measures decoding performance
91+
hyperfine --warmup 3 --runs 10 "bqtools decode some.cbq > /dev/null"
92+
```
93+
6794
## Contribution Guide
6895

6996
When making changes, keep the following documentation in sync:

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "bqtools"
3-
version = "0.5.2"
3+
version = "0.5.3"
44
edition = "2021"
55
license = "MIT"
66
authors = ["Noam Teyssier <[email protected]>"]

src/cli/grep.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,7 @@ pub struct GrepArgs {
7979
/// Denotes patterns are fixed strings (non-regex)
8080
///
8181
/// Allows usage of Aho-Corasick algorithm for efficient matching.
82+
/// This is auto-detected when every pattern is non-empty and consists only of uppercase `A`, `C`, `G`, `T`.
8283
#[clap(short = 'x', long)]
8384
pub fixed: bool,
8485

src/commands/grep/mod.rs

Lines changed: 100 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,22 @@ use crate::{
2424
use anyhow::Result;
2525
use binseq::prelude::*;
2626

27+
/// Reports whether `pattern` is a plain DNA literal.
///
/// A pattern qualifies only when it is non-empty and every byte is one of the
/// uppercase bases `A`, `C`, `G`, or `T`. Anything else (lowercase bases,
/// other letters such as `N`, regex metacharacters) yields `false`.
fn is_fixed(pattern: &[u8]) -> bool {
    // An empty pattern is never treated as a fixed string.
    if pattern.is_empty() {
        return false;
    }
    for &base in pattern {
        match base {
            b'A' | b'C' | b'G' | b'T' => {}
            // The first non-ACGT byte disqualifies the whole pattern.
            _ => return false,
        }
    }
    true
}
34+
35+
/// Returns true if all patterns across multiple sets are fixed DNA strings.
36+
fn all_patterns_fixed(pattern_sets: &[&[Vec<u8>]]) -> bool {
37+
pattern_sets
38+
.iter()
39+
.flat_map(|s| s.iter())
40+
.all(|p| is_fixed(p))
41+
}
42+
2743
fn build_counter(args: &GrepCommand) -> Result<PatternCounter> {
2844
#[cfg(feature = "fuzzy")]
2945
if args.grep.fuzzy_args.fuzzy {
@@ -38,14 +54,17 @@ fn build_counter(args: &GrepCommand) -> Result<PatternCounter> {
3854
return Ok(PatternCounter::Fuzzy(counter));
3955
}
4056

41-
if args.grep.fixed {
42-
let counter = AhoCorasickPatternCounter::new(
43-
args.grep.bytes_pat1()?,
44-
args.grep.bytes_pat2()?,
45-
args.grep.bytes_pat()?,
46-
args.grep.no_dfa,
47-
args.grep.invert,
48-
)?;
57+
let pat1 = args.grep.bytes_pat1()?;
58+
let pat2 = args.grep.bytes_pat2()?;
59+
let pat = args.grep.bytes_pat()?;
60+
let use_fixed = args.grep.fixed || all_patterns_fixed(&[&pat1, &pat2, &pat]);
61+
if !args.grep.fixed && use_fixed {
62+
log::debug!("All patterns are fixed strings — auto-selecting Aho-Corasick");
63+
}
64+
65+
if use_fixed {
66+
let counter =
67+
AhoCorasickPatternCounter::new(pat1, pat2, pat, args.grep.no_dfa, args.grep.invert)?;
4968
Ok(PatternCounter::AhoCorasick(counter))
5069
} else {
5170
let counter = RegexPatternCounter::new(
@@ -89,17 +108,25 @@ fn build_matcher(args: &GrepCommand) -> Result<PatternMatcher> {
89108
return Ok(PatternMatcher::Fuzzy(matcher));
90109
}
91110

92-
if args.grep.fixed && !args.grep.and_logic() {
111+
let pat1 = args.grep.bytes_pat1()?;
112+
let pat2 = args.grep.bytes_pat2()?;
113+
let pat = args.grep.bytes_pat()?;
114+
let use_fixed = args.grep.fixed || all_patterns_fixed(&[&pat1, &pat2, &pat]);
115+
if !args.grep.fixed && use_fixed {
116+
log::debug!("All patterns are fixed strings — auto-selecting Aho-Corasick");
117+
}
118+
119+
if use_fixed && !args.grep.and_logic() {
93120
let matcher = AhoCorasickMatcher::new(
94-
args.grep.bytes_pat1()?,
95-
args.grep.bytes_pat2()?,
96-
args.grep.bytes_pat()?,
121+
pat1,
122+
pat2,
123+
pat,
97124
args.grep.no_dfa,
98125
args.grep.range.map_or(0, |r| r.offset()),
99126
)?;
100127
Ok(PatternMatcher::AhoCorasick(matcher))
101128
} else {
102-
if args.grep.fixed {
129+
if use_fixed {
103130
warn!("`-x/--fixed provided but ignored when using AND logic");
104131
}
105132
let matcher = RegexMatcher::new(
@@ -168,3 +195,63 @@ pub fn run(args: &GrepCommand) -> Result<()> {
168195
run_grep(args, reader, writer, format, mate)
169196
}
170197
}
198+
199+
#[cfg(test)]
mod fixed_detection_tests {
    use super::{all_patterns_fixed, is_fixed};

    /// Uppercase ACGT-only patterns are detected as fixed.
    #[test]
    fn test_fixed_dna_strings() {
        let accepted: [&[u8]; 3] = [b"ACGTACGT", b"AAAAAAAAAA", b"ACGT"];
        for pattern in accepted.iter() {
            assert!(is_fixed(pattern));
        }
    }

    /// The empty pattern is never considered fixed.
    #[test]
    fn test_empty_string() {
        let empty: &[u8] = b"";
        assert!(!is_fixed(empty));
    }

    /// Letters outside A/C/G/T (here N, R, Y, W) disqualify a pattern.
    #[test]
    fn test_iupac_ambiguity_codes() {
        let rejected: [&[u8]; 2] = [b"ACGTNRYW", b"ACGN"];
        for pattern in rejected.iter() {
            assert!(!is_fixed(pattern));
        }
    }

    /// Detection is case-sensitive: lowercase bases are not fixed.
    #[test]
    fn test_lowercase_not_fixed() {
        let lowercase: &[u8] = b"acgt";
        assert!(!is_fixed(lowercase));
    }

    /// Any regex metacharacter disqualifies a pattern.
    #[test]
    fn test_regex_patterns_not_fixed() {
        let rejected: [&[u8]; 7] = [
            b"AC.GT",
            b"AC[GT]",
            b"A{3}",
            b"^ACGT",
            b"ACG|TGA",
            b"(ACG)",
            b"AC\\dGT",
        ];
        for pattern in rejected.iter() {
            assert!(!is_fixed(pattern));
        }
    }

    /// Every pattern in every set is fixed, so the whole input is fixed.
    #[test]
    fn test_all_patterns_fixed() {
        let first: Vec<Vec<u8>> = vec![b"ACGT".to_vec(), b"TTTT".to_vec()];
        let second: Vec<Vec<u8>> = vec![b"GGGG".to_vec()];
        let sets: [&[Vec<u8>]; 2] = [&first, &second];
        assert!(all_patterns_fixed(&sets));
    }

    /// A single regex pattern in any set makes the whole input non-fixed.
    #[test]
    fn test_all_patterns_fixed_with_regex() {
        let first: Vec<Vec<u8>> = vec![b"ACGT".to_vec(), b"AC.GT".to_vec()];
        let second: Vec<Vec<u8>> = vec![b"GGGG".to_vec()];
        let sets: [&[Vec<u8>]; 2] = [&first, &second];
        assert!(!all_patterns_fixed(&sets));
    }

    /// With no patterns to check, the result is `true`.
    #[test]
    fn test_all_patterns_fixed_empty_sets() {
        let first: Vec<Vec<u8>> = Vec::new();
        let second: Vec<Vec<u8>> = Vec::new();
        let sets: [&[Vec<u8>]; 2] = [&first, &second];
        assert!(all_patterns_fixed(&sets));
    }
}

0 commit comments

Comments
 (0)