@@ -24,6 +24,22 @@ use crate::{
2424use anyhow:: Result ;
2525use binseq:: prelude:: * ;
2626
/// Reports whether `pattern` is a literal DNA sequence.
///
/// A pattern qualifies only when it is non-empty and every byte is one of the
/// uppercase bases `A`, `C`, `G` or `T`. Any other byte (lowercase bases,
/// ambiguity codes such as `N`, regex metacharacters) makes it non-fixed.
fn is_fixed(pattern: &[u8]) -> bool {
    // An empty pattern is never considered a fixed string.
    if pattern.is_empty() {
        return false;
    }
    pattern
        .iter()
        .all(|&base| matches!(base, b'A' | b'C' | b'G' | b'T'))
}
34+
35+ /// Returns true if all patterns across multiple sets are fixed DNA strings.
36+ fn all_patterns_fixed ( pattern_sets : & [ & [ Vec < u8 > ] ] ) -> bool {
37+ pattern_sets
38+ . iter ( )
39+ . flat_map ( |s| s. iter ( ) )
40+ . all ( |p| is_fixed ( p) )
41+ }
42+
2743fn build_counter ( args : & GrepCommand ) -> Result < PatternCounter > {
2844 #[ cfg( feature = "fuzzy" ) ]
2945 if args. grep . fuzzy_args . fuzzy {
@@ -38,14 +54,17 @@ fn build_counter(args: &GrepCommand) -> Result<PatternCounter> {
3854 return Ok ( PatternCounter :: Fuzzy ( counter) ) ;
3955 }
4056
41- if args. grep . fixed {
42- let counter = AhoCorasickPatternCounter :: new (
43- args. grep . bytes_pat1 ( ) ?,
44- args. grep . bytes_pat2 ( ) ?,
45- args. grep . bytes_pat ( ) ?,
46- args. grep . no_dfa ,
47- args. grep . invert ,
48- ) ?;
57+ let pat1 = args. grep . bytes_pat1 ( ) ?;
58+ let pat2 = args. grep . bytes_pat2 ( ) ?;
59+ let pat = args. grep . bytes_pat ( ) ?;
60+ let use_fixed = args. grep . fixed || all_patterns_fixed ( & [ & pat1, & pat2, & pat] ) ;
61+ if !args. grep . fixed && use_fixed {
62+ log:: debug!( "All patterns are fixed strings — auto-selecting Aho-Corasick" ) ;
63+ }
64+
65+ if use_fixed {
66+ let counter =
67+ AhoCorasickPatternCounter :: new ( pat1, pat2, pat, args. grep . no_dfa , args. grep . invert ) ?;
4968 Ok ( PatternCounter :: AhoCorasick ( counter) )
5069 } else {
5170 let counter = RegexPatternCounter :: new (
@@ -89,17 +108,25 @@ fn build_matcher(args: &GrepCommand) -> Result<PatternMatcher> {
89108 return Ok ( PatternMatcher :: Fuzzy ( matcher) ) ;
90109 }
91110
92- if args. grep . fixed && !args. grep . and_logic ( ) {
111+ let pat1 = args. grep . bytes_pat1 ( ) ?;
112+ let pat2 = args. grep . bytes_pat2 ( ) ?;
113+ let pat = args. grep . bytes_pat ( ) ?;
114+ let use_fixed = args. grep . fixed || all_patterns_fixed ( & [ & pat1, & pat2, & pat] ) ;
115+ if !args. grep . fixed && use_fixed {
116+ log:: debug!( "All patterns are fixed strings — auto-selecting Aho-Corasick" ) ;
117+ }
118+
119+ if use_fixed && !args. grep . and_logic ( ) {
93120 let matcher = AhoCorasickMatcher :: new (
94- args . grep . bytes_pat1 ( ) ? ,
95- args . grep . bytes_pat2 ( ) ? ,
96- args . grep . bytes_pat ( ) ? ,
121+ pat1 ,
122+ pat2 ,
123+ pat ,
97124 args. grep . no_dfa ,
98125 args. grep . range . map_or ( 0 , |r| r. offset ( ) ) ,
99126 ) ?;
100127 Ok ( PatternMatcher :: AhoCorasick ( matcher) )
101128 } else {
102- if args . grep . fixed {
129+ if use_fixed {
103130 warn ! ( "`-x/--fixed provided but ignored when using AND logic" ) ;
104131 }
105132 let matcher = RegexMatcher :: new (
@@ -168,3 +195,63 @@ pub fn run(args: &GrepCommand) -> Result<()> {
168195 run_grep ( args, reader, writer, format, mate)
169196 }
170197}
198+
#[cfg(test)]
mod fixed_detection_tests {
    use super::{all_patterns_fixed, is_fixed};

    /// Plain uppercase ACGT sequences are accepted.
    #[test]
    fn test_fixed_dna_strings() {
        let accepted: [&[u8]; 3] = [b"ACGTACGT", b"AAAAAAAAAA", b"ACGT"];
        for &pattern in &accepted {
            assert!(is_fixed(pattern));
        }
    }

    /// The empty pattern is rejected.
    #[test]
    fn test_empty_string() {
        let empty: &[u8] = b"";
        assert!(!is_fixed(empty));
    }

    /// IUPAC ambiguity codes are not plain bases.
    #[test]
    fn test_iupac_ambiguity_codes() {
        let rejected: [&[u8]; 2] = [b"ACGTNRYW", b"ACGN"];
        for &pattern in &rejected {
            assert!(!is_fixed(pattern));
        }
    }

    /// Detection is case-sensitive: lowercase bases are rejected.
    #[test]
    fn test_lowercase_not_fixed() {
        let lowercase: &[u8] = b"acgt";
        assert!(!is_fixed(lowercase));
    }

    /// Anything containing regex syntax is rejected.
    #[test]
    fn test_regex_patterns_not_fixed() {
        let rejected: [&[u8]; 7] = [
            b"AC.GT",
            b"AC[GT]",
            b"A{3}",
            b"^ACGT",
            b"ACG|TGA",
            b"(ACG)",
            b"AC\\dGT",
        ];
        for &pattern in &rejected {
            assert!(!is_fixed(pattern));
        }
    }

    /// Every pattern in every set is fixed.
    #[test]
    fn test_all_patterns_fixed() {
        let first = vec![b"ACGT".to_vec(), b"TTTT".to_vec()];
        let second = vec![b"GGGG".to_vec()];
        assert!(all_patterns_fixed(&[&first, &second]));
    }

    /// A single regex-like pattern makes the whole collection non-fixed.
    #[test]
    fn test_all_patterns_fixed_with_regex() {
        let first = vec![b"ACGT".to_vec(), b"AC.GT".to_vec()];
        let second = vec![b"GGGG".to_vec()];
        assert!(!all_patterns_fixed(&[&first, &second]));
    }

    /// With no patterns at all, the check holds vacuously.
    #[test]
    fn test_all_patterns_fixed_empty_sets() {
        let first: Vec<Vec<u8>> = Vec::new();
        let second: Vec<Vec<u8>> = Vec::new();
        assert!(all_patterns_fixed(&[&first, &second]));
    }
}
0 commit comments