1- use anyhow:: { bail, Result } ;
1+ use std:: {
2+ fs,
3+ io:: { self , Read } ,
4+ } ;
5+
6+ use anyhow:: Result ;
27use clap:: Parser ;
8+ use paraseq:: { fasta, Record } ;
39
4- use crate :: { cli:: FileFormat , commands:: grep:: SimpleRange } ;
10+ use crate :: {
11+ cli:: FileFormat ,
12+ commands:: grep:: { Pattern , PatternCollection , SimpleRange } ,
13+ } ;
514
615use super :: { InputBinseq , OutputFile } ;
716
@@ -121,51 +130,6 @@ impl GrepArgs {
121130 }
122131 Ok ( ( ) )
123132 }
124- fn chain_regex (
125- & self ,
126- cli_patterns : & [ String ] ,
127- filetype : PatternFileType ,
128- ) -> Result < Vec < regex:: bytes:: Regex > > {
129- let mut all_patterns = cli_patterns
130- . iter ( )
131- . map ( std:: borrow:: ToOwned :: to_owned)
132- . collect :: < Vec < String > > ( ) ;
133- if !self . file_args . empty_file ( filetype) {
134- all_patterns. extend ( self . file_args . read_file_patterns ( filetype) ?) ;
135- }
136-
137- // all patterns are kept separate for:
138- // 1. AND logic
139- // 2. Individual pattern counting
140- if self . and_logic ( ) || self . pattern_count {
141- Ok ( all_patterns
142- . iter ( )
143- . map ( |s| {
144- regex:: bytes:: Regex :: new ( s) . expect ( "Could not build regex from pattern: {s}" )
145- } )
146- . collect ( ) )
147-
148- // for OR logic they can be compiled into a single regex for performance
149- } else {
150- let global_pattern = all_patterns. join ( "|" ) ;
151- if global_pattern. is_empty ( ) {
152- Ok ( vec ! [ ] )
153- } else {
154- Ok ( vec ! [ regex:: bytes:: Regex :: new( & global_pattern) . expect(
155- "Could not build regex from pattern: {global_pattern}" ,
156- ) ] )
157- }
158- }
159- }
160- pub fn bytes_reg1 ( & self ) -> Result < Vec < regex:: bytes:: Regex > > {
161- self . chain_regex ( & self . reg1 , PatternFileType :: SFile )
162- }
163- pub fn bytes_reg2 ( & self ) -> Result < Vec < regex:: bytes:: Regex > > {
164- self . chain_regex ( & self . reg2 , PatternFileType :: XFile )
165- }
166- pub fn bytes_reg ( & self ) -> Result < Vec < regex:: bytes:: Regex > > {
167- self . chain_regex ( & self . reg , PatternFileType :: File )
168- }
169133 pub fn and_logic ( & self ) -> bool {
170134 if self . file_args . empty ( ) {
171135 !self . or_logic
@@ -177,27 +141,30 @@ impl GrepArgs {
177141}
178142
179143impl GrepArgs {
180- fn chain_bytes (
144+ fn chain_patterns (
181145 & self ,
182146 cli_patterns : & [ String ] ,
183147 filetype : PatternFileType ,
184- ) -> Result < Vec < Vec < u8 > > > {
185- let bytes_iter = cli_patterns. iter ( ) . map ( |s| s. as_bytes ( ) . to_vec ( ) ) ;
148+ ) -> Result < PatternCollection > {
149+ let cli_iter = cli_patterns. iter ( ) . map ( |s| Pattern {
150+ name : None ,
151+ sequence : s. as_bytes ( ) . to_vec ( ) ,
152+ } ) ;
186153 if self . file_args . empty_file ( filetype) {
187- Ok ( bytes_iter . collect ( ) )
154+ Ok ( PatternCollection ( cli_iter . collect ( ) ) )
188155 } else {
189- let patterns = self . file_args . patterns ( filetype) ?;
190- Ok ( bytes_iter . chain ( patterns ) . collect ( ) )
156+ let file_patterns = self . file_args . patterns ( filetype) ?;
157+ Ok ( PatternCollection ( cli_iter . chain ( file_patterns ) . collect ( ) ) )
191158 }
192159 }
193- pub fn bytes_pat1 ( & self ) -> Result < Vec < Vec < u8 > > > {
194- self . chain_bytes ( & self . reg1 , PatternFileType :: SFile )
160+ pub fn patterns_m1 ( & self ) -> Result < PatternCollection > {
161+ self . chain_patterns ( & self . reg1 , PatternFileType :: SFile )
195162 }
196- pub fn bytes_pat2 ( & self ) -> Result < Vec < Vec < u8 > > > {
197- self . chain_bytes ( & self . reg2 , PatternFileType :: XFile )
163+ pub fn patterns_m2 ( & self ) -> Result < PatternCollection > {
164+ self . chain_patterns ( & self . reg2 , PatternFileType :: XFile )
198165 }
199- pub fn bytes_pat ( & self ) -> Result < Vec < Vec < u8 > > > {
200- self . chain_bytes ( & self . reg , PatternFileType :: File )
166+ pub fn patterns ( & self ) -> Result < PatternCollection > {
167+ self . chain_patterns ( & self . reg , PatternFileType :: File )
201168 }
202169}
203170
@@ -229,28 +196,32 @@ pub struct FuzzyArgs {
229196pub struct PatternFileArgs {
230197 /// File of patterns to search for
231198 ///
232- /// This assumes one pattern per line.
199+ /// Accepts a plain text file (one pattern per line) or a FASTA file
200+ /// (sequences are used as patterns). FASTA files are auto-detected.
233201 /// Patterns may be regex or literal (fuzzy doesn't support regex).
234202 /// These will match against either primary or extended sequence.
235203 #[ clap( long) ]
236204 pub file : Option < String > ,
237205
238206 /// File of patterns to search for in primary sequence
239207 ///
240- /// This assumes one pattern per line.
208+ /// Accepts a plain text file (one pattern per line) or a FASTA file
209+ /// (sequences are used as patterns). FASTA files are auto-detected.
241210 /// Patterns may be regex or literal (fuzzy doesn't support regex).
242211 #[ clap( long) ]
243212 pub sfile : Option < String > ,
244213
245214 /// File of patterns to search for in extended sequence
246215 ///
247- /// This assumes one pattern per line.
216+ /// Accepts a plain text file (one pattern per line) or a FASTA file
217+ /// (sequences are used as patterns). FASTA files are auto-detected.
248218 /// Patterns may be regex or literal (fuzzy doesn't support regex).
249219 #[ clap( long) ]
250220 pub xfile : Option < String > ,
251221}
222+
252223impl PatternFileArgs {
253- fn empty ( & self ) -> bool {
224+ pub ( crate ) fn empty ( & self ) -> bool {
254225 self . file . is_none ( ) && self . sfile . is_none ( ) && self . xfile . is_none ( )
255226 }
256227
@@ -262,34 +233,62 @@ impl PatternFileArgs {
262233 }
263234 }
264235
265- fn read_file ( & self , filetype : PatternFileType ) -> Result < String > {
236+ fn file_path ( & self , filetype : PatternFileType ) -> Result < & str > {
266237 let file = match filetype {
267238 PatternFileType :: File => & self . file ,
268239 PatternFileType :: SFile => & self . sfile ,
269240 PatternFileType :: XFile => & self . xfile ,
270241 } ;
271- if let Some ( file) = file {
272- Ok ( std:: fs:: read_to_string ( file) ?)
273- } else {
274- bail ! ( "Specified file type {filetype:?} not provided at CLI" )
275- }
242+ file. as_deref ( )
243+ . ok_or_else ( || anyhow:: anyhow!( "Specified file type {filetype:?} not provided at CLI" ) )
276244 }
277245
278- fn read_file_patterns ( & self , filetype : PatternFileType ) -> Result < Vec < String > > {
279- let contents = self . read_file ( filetype) ?;
280- Ok ( contents
281- . lines ( )
282- . map ( std:: string:: ToString :: to_string)
283- . collect ( ) )
246+ /// Returns true if the file starts with '>' (FASTA format).
247+ fn is_fasta ( path : & str ) -> Result < bool > {
248+ let file = fs:: File :: open ( path) ?;
249+ // only take up to 10 bytes to determine fasta status
250+ for byte in io:: BufReader :: new ( file) . bytes ( ) . take ( 10 ) {
251+ let b = byte?;
252+ if b != b'\n' && b != b'\r' {
253+ return Ok ( b == b'>' ) ;
254+ }
255+ }
256+ Ok ( false )
284257 }
285258
286- fn patterns ( & self , filetype : PatternFileType ) -> Result < Vec < Vec < u8 > > > {
287- let contents = self . read_file ( filetype) ?;
288- let mut patterns = Vec :: new ( ) ;
289- for line in contents. lines ( ) {
290- patterns. push ( line. as_bytes ( ) . to_vec ( ) ) ;
259+ /// Load patterns from a file, auto-detecting FASTA vs plain text.
260+ fn load_patterns ( path : & str ) -> Result < Vec < Pattern > > {
261+ if Self :: is_fasta ( path) ? {
262+ let mut reader = fasta:: Reader :: from_path ( path) ?;
263+ let mut rset = fasta:: RecordSet :: default ( ) ;
264+ let mut patterns = Vec :: new ( ) ;
265+
266+ while rset. fill ( & mut reader) ? {
267+ for record in rset. iter ( ) {
268+ let record = record?;
269+ patterns. push ( Pattern {
270+ name : Some ( record. id_str ( ) . to_string ( ) ) ,
271+ sequence : record. seq ( ) . into_owned ( ) ,
272+ } ) ;
273+ }
274+ }
275+
276+ Ok ( patterns)
277+ } else {
278+ let contents = std:: fs:: read_to_string ( path) ?;
279+ Ok ( contents
280+ . lines ( )
281+ . map ( |line| Pattern {
282+ name : None ,
283+ sequence : line. as_bytes ( ) . to_vec ( ) ,
284+ } )
285+ . collect ( ) )
291286 }
292- Ok ( patterns)
287+ }
288+
289+ fn patterns ( & self , filetype : PatternFileType ) -> Result < Vec < Pattern > > {
290+ let path = self . file_path ( filetype) ?;
291+ Self :: load_patterns ( path)
293292 }
294293}
295294
0 commit comments