 !*/

 use crate::types::*;
-use regex::Regex;
+use fancy_regex::Regex;
 use regex_automata::util::lazy::Lazy;

 #[derive(PartialEq)]
@@ -84,9 +84,9 @@ const STRING: &str = r#"^\"(?<content>.*?(\\)*?)\"(?<caseinvariant>i?)"#;
 const STRING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(STRING).unwrap());
 const NL: &str = r"^(\r?\n)+\s*";
 const NL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(NL).unwrap());
-const REGEXP: &str = r#"^\/(?<pattern>(\\\/|\\\\|[^\/])*?)\/(?<flags>[imslux]*)"#;
+const REGEXP: &str = r#"^\/(?!\/)(?<pattern>(\\\/|\\\\|[^\/])*?)\/(?<flags>[imslux]*)"#;
 const REGEXP_RE: Lazy<Regex> = Lazy::new(|| Regex::new(REGEXP).unwrap());
-const OP: &str = r"^[+\*\?] ";
+const OP: &str = r"^[+\*]|[?](?![a-z]) ";
 const OP_RE: Lazy<Regex> = Lazy::new(|| Regex::new(OP).unwrap());
 const VBAR: &str = r"^((\r?\n)+\s*)?\|";
 const VBAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(VBAR).unwrap());
@@ -138,9 +138,15 @@ impl EBNFParser {
         // Advance to next non-whitespace (or comment) in the input.
         self.consume_space();
         // item: rule | token
-        if RULE_RE.is_match(&self.input_string[self.cur_pos..]) {
+        if RULE_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
             self.parse_rule();
-        } else if TOKEN_RE.is_match(&self.input_string[self.cur_pos..]) {
+        } else if TOKEN_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
             self.parse_token();
         } else if self.peek(0) == Some('%') {
             self.parse_statement();
@@ -174,8 +180,10 @@ impl EBNFParser {
         // We know there's a match because this method is only called after
         // we've already determined that the next thing is a rule.
         // Remove the leading ? or !, if any, because we don't care about it.
+        self.consume_space();
         let rule_match = RULE_RE
             .captures(&self.input_string[self.cur_pos..])
+            .unwrap()
             .unwrap();
         // eprintln!("{:?}", rule_match);
         self.cur_parsing = Item::RULE;
@@ -210,10 +218,14 @@ impl EBNFParser {
     /// Note that the params that are present in the syntax have already been
     /// expanded by the time this procedure is called.
     fn parse_token(&mut self) {
-        let token_match = TOKEN_RE.find(&self.input_string[self.cur_pos..]).unwrap();
+        self.consume_space();
+        let token_match = TOKEN_RE
+            .find(&self.input_string[self.cur_pos..])
+            .unwrap()
+            .unwrap();

         self.name_stack.push(token_match.as_str().into());
-        self.consume(token_match.len());
+        self.consume(token_match.as_str().len());

         if self.peek(0) == Some('.') {
             self.parse_priority();
@@ -283,7 +295,7 @@ impl EBNFParser {
             "%ig" => {
                 self.consume("%ignore".len());
                 self.consume_space();
-                let Some(token) = TOKEN_RE.find(&self.input_string[self.cur_pos..]) else {
+                let Ok(Some(token)) = TOKEN_RE.find(&self.input_string[self.cur_pos..]) else {
                     self.report_parse_error("Expected token after %ignore statement.");
                     return;
                 };
@@ -312,7 +324,7 @@ impl EBNFParser {
             self.report_parse_error("Expected '.'.");
         }

-        let Some(number_match) = NUMBER_RE.find(&self.input_string[self.cur_pos..]) else {
+        let Ok(Some(number_match)) = NUMBER_RE.find(&self.input_string[self.cur_pos..]) else {
             // Add a return statement to make the compiler happy, even though
             // this call never returns.
             self.report_parse_error("Expected a number.");
@@ -322,7 +334,7 @@ impl EBNFParser {
         // Parsing as an integer *should* never fail, since we've just matched
         // something that must be a valid number.
         self.cur_priority = number_match.as_str().parse::<i32>().unwrap();
-        self.consume(number_match.len());
+        self.consume(number_match.as_str().len());
     }

     /// Parse the expansions of the rule.
@@ -348,15 +360,18 @@ impl EBNFParser {

             // If there's another alias, parse it.
             // self.consume_space();
-            if VBAR_RE.is_match(&self.input_string[self.cur_pos..]) {
-                // eprintln!("VBAR match");
-                // Advance past the VBAR to the beginning of the next alias.
-                self.consume_space();
-                self.consume(1);
-                continue;
-            } else {
-                // eprintln!("VBAR no match");
-                break self.name_stack.last().unwrap().to_string();
+            match VBAR_RE.is_match(&self.input_string[self.cur_pos..]) {
+                Ok(true) => {
+                    // eprintln!("VBAR match");
+                    // Advance past the VBAR to the beginning of the next alias.
+                    self.consume_space();
+                    self.consume(1);
+                    continue;
+                }
+                _ => {
+                    // eprintln!("VBAR no match");
+                    break self.name_stack.last().unwrap().to_string();
+                }
             }
         }
     }
@@ -391,7 +406,9 @@ impl EBNFParser {
             self.consume_space();
             res.push(self.parse_expression());
             self.consume_space();
-            if VBAR_RE.is_match(&self.input_string[self.cur_pos..])
+            if VBAR_RE
+                .is_match(&self.input_string[self.cur_pos..])
+                .unwrap()
                 || self.peek(0) == Some(')')
                 || self.peek(0) == Some(']')
                 || (self.peek(0) == Some('-') && self.peek(1) == Some('>'))
@@ -415,73 +432,70 @@ impl EBNFParser {
         let new_atom = self.parse_atom();
         // eprintln!("parse_expression: {}", new_atom);
         // eprintln!("cur_pos: {}", self.cur_pos);
-        if OP_RE.is_match(&self.input_string[self.cur_pos..])
-            && !(self.peek(0) == Some('?')
-                // Use a default value that will evaluate to false to avoid
-                // panicking in the case where the question mark is the last
-                // character of the input.
-                && self.peek(1).unwrap_or('!').is_ascii_lowercase())
-        {
-            // We don't get lookahead in this regex library, but the lark
-            // grammar has the lookahead (?![a-z]) after the question mark
-            // operator, which makes sure we don't treat the question mark at
-            // the beginning of an identifier as a question mark
-            // operator. Instead, we manually implement something like this lookahead.
-            // TOOD: Switch to fancy-regex to support lookaround.
-            // eprintln!("Match on OP_RE");
-            let new_nonterm = self.new_nonterminal("expression");
-            match self.peek(0).unwrap() {
-                '*' => {
-                    // Convert every repetition E* to a fresh non-terminal X
-                    // and add X = $\epsilon$ | X E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec!["".to_string()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_nonterm.clone(), new_atom],
-                    });
-                }
-                '+' => {
-                    // Convert every at-least-one repetition E+ to a fresh
-                    // non-terminal X and add X = E | X E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_atom.clone()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_nonterm.clone(), new_atom.clone()],
-                    });
-                }
-                '?' => {
-                    // Convert every option E? to a fresh non-terminal X and
-                    // add X = $\epsilon$ | E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec!["".to_string()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_atom.clone()],
-                    });
+        match OP_RE.is_match(&self.input_string[self.cur_pos..]) {
+            Ok(true) => {
+                // We don't get lookahead in this regex library, but the lark
+                // grammar has the lookahead (?![a-z]) after the question mark
+                // operator, which makes sure we don't treat the question mark at
+                // the beginning of an identifier as a question mark
+                // operator. Instead, we manually implement something like this lookahead.
+                // TOOD: Switch to fancy-regex to support lookaround.
+                // eprintln!("Match on OP_RE");
+                let new_nonterm = self.new_nonterminal("expression");
+                match self.peek(0).unwrap() {
+                    '*' => {
+                        // Convert every repetition E* to a fresh non-terminal X
+                        // and add X = $\epsilon$ | X E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec!["".to_string()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_nonterm.clone(), new_atom],
+                        });
+                    }
+                    '+' => {
+                        // Convert every at-least-one repetition E+ to a fresh
+                        // non-terminal X and add X = E | X E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_atom.clone()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_nonterm.clone(), new_atom.clone()],
+                        });
+                    }
+                    '?' => {
+                        // Convert every option E? to a fresh non-terminal X and
+                        // add X = $\epsilon$ | E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec!["".to_string()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_atom.clone()],
+                        });
+                    }
+                    _ => { /* We should never reach this because we already matched above. */ }
                 }
-                _ => { /* We should never reach this because we already matched above. */ }
+                self.consume(1);
+                self.grammar.symbol_set.push(new_nonterm.clone());
+                return new_nonterm;
+                // TODO: Add support for range repeats.
+                // } else if self.peek(0) == Some('~') {
+                //     // The following is a range of some kind.
+                //     self.consume(1);
+                //     self.consume_space();
+                //     // self.handle_range();
+            }
+            _ => {
+                // eprintln!("No match on OP_RE");
+                // An unquantified atom.
+                new_atom
+            }
             }
-            self.consume(1);
-            self.grammar.symbol_set.push(new_nonterm.clone());
-            return new_nonterm;
-            // TODO: Add support for range repeats.
-            // } else if self.peek(0) == Some('~') {
-            //     // The following is a range of some kind.
-            //     self.consume(1);
-            //     self.consume_space();
-            //     // self.handle_range();
-        } else {
-            // eprintln!("No match on OP_RE");
-            // An unquantified atom.
-            new_atom
         }
     }

@@ -553,7 +567,7 @@ impl EBNFParser {
         let input_string = self.input_string.clone();
         // self.consume_space();

-        let Some(matched_string) = STRING_RE.captures(&input_string[self.cur_pos..]) else {
+        let Ok(Some(matched_string)) = STRING_RE.captures(&input_string[self.cur_pos..]) else {
             return self.report_parse_error("String ill-formed.");
         };

@@ -568,7 +582,7 @@ impl EBNFParser {
         if self.peek(0) == Some('.') && self.peek(1) == Some('.') {
             // Literal range!
             self.consume(2);
-            let Some(second_matched_string) = STRING_RE.captures(&input_string[self.cur_pos..])
+            let Ok(Some(second_matched_string)) = STRING_RE.captures(&input_string[self.cur_pos..])
             else {
                 return self.report_parse_error("String ill-formed.");
             };
@@ -614,16 +628,27 @@ impl EBNFParser {
     /// `name: RULE | TOKEN`
     fn parse_name(&mut self) -> String {
         let input_string = self.input_string.clone();
-        if RULE_RE.is_match(&self.input_string[self.cur_pos..]) {
-            let rule_match = RULE_RE.find(&input_string[self.cur_pos..]).unwrap();
-            self.consume(rule_match.len());
+        self.consume_space();
+        if RULE_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
+            let Some(rule_match) = RULE_RE.find(&input_string[self.cur_pos..]).unwrap() else {
+                panic!()
+            };
+            self.consume(rule_match.as_str().len());
             self.grammar
                 .symbol_set
                 .push(rule_match.as_str().to_string());
             return rule_match.as_str().into();
-        } else if TOKEN_RE.is_match(&self.input_string[self.cur_pos..]) {
-            let token_match = TOKEN_RE.find(&input_string[self.cur_pos..]).unwrap();
-            self.consume(token_match.len());
+        } else if TOKEN_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
+            let Some(token_match) = TOKEN_RE.find(&input_string[self.cur_pos..]).unwrap() else {
+                panic!()
+            };
+            self.consume(token_match.as_str().len());
             self.grammar
                 .symbol_set
                 .push(token_match.as_str().to_string());
@@ -646,7 +671,7 @@ impl EBNFParser {
     /// grammar and treat them as such.
     fn parse_regex(&mut self) -> String {
         let input_string = self.input_string.clone();
-        let Some(re_match) = REGEXP_RE.captures(&input_string[self.cur_pos..]) else {
+        let Ok(Some(re_match)) = REGEXP_RE.captures(&input_string[self.cur_pos..]) else {
             self.report_parse_error("Failed to parse regular expression.");
             return "".into();
         };
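
The change driving most of this diff is the switch from the `regex` crate to `fancy_regex`: its `is_match`, `find`, and `captures` return `Result` because matching itself can fail at run time (for example, when a backtracking limit is exceeded), which is why the calls above gain `.unwrap()` or are matched against `Ok(Some(..))`; it also supports lookaround, which lets lookaheads such as `(?!\/)` and `(?![a-z])` live directly in the patterns. Below is a minimal standalone sketch of that API shape; the `main` function, pattern, and sample inputs are illustrative assumptions, not code from the repository.

use fancy_regex::Regex;

fn main() {
    // Lookahead is supported by fancy_regex (unlike the plain `regex` crate),
    // so a pattern in the spirit of the commit's `^\/(?!\/)` addition can
    // reject a leading "//" directly.
    let re = Regex::new(r"^/(?!/)").unwrap();

    // `is_match` returns Result<bool, fancy_regex::Error> rather than a bare
    // bool, hence the `.unwrap()` added after each `is_match` call above.
    assert!(re.is_match("/abc/").unwrap());
    assert!(!re.is_match("// line comment, not a regex literal").unwrap());

    // `find` returns Result<Option<Match>, Error>, hence the
    // `.unwrap().unwrap()` chains and `Ok(Some(..))` let-else patterns above.
    match re.find("/abc/i") {
        Ok(Some(m)) => println!("matched {:?} at {}..{}", m.as_str(), m.start(), m.end()),
        Ok(None) => println!("no match"),
        Err(e) => println!("regex engine error: {e}"),
    }
}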