
Commit fb5f4ea

Play with fancy-regex.

1 parent b6d58c8

1 file changed, 119 additions, 94 deletions

src/grammar.rs
@@ -24,7 +24,7 @@ TODO:
 !*/
 
 use crate::types::*;
-use regex::Regex;
+use fancy_regex::Regex;
 use regex_automata::util::lazy::Lazy;
 
 #[derive(PartialEq)]
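
Most of the rest of this diff is mechanical fallout from the import swap above: unlike the regex crate, fancy_regex can fail at match time (for example when its backtracking limit is exceeded), so is_match returns Result<bool> and find/captures return Result<Option<..>>. A minimal sketch of that API shape, separate from the commit itself:

    use fancy_regex::Regex;

    fn api_shape_demo() {
        let re = Regex::new(r"^[+\*]|[?](?![a-z])").unwrap();

        // is_match yields Result<bool, Error> rather than a plain bool,
        // which is where the .unwrap() chains in the hunks below come from.
        let quantifier: bool = re.is_match("?foo").unwrap();
        assert!(!quantifier); // '?' starting a lowercase identifier is not an operator

        // find and captures yield Result<Option<..>>, hence the
        // .unwrap().unwrap() and `let Ok(Some(..)) = .. else` patterns below.
        if let Ok(Some(m)) = re.find("+rest") {
            assert_eq!(m.as_str(), "+");
        }
    }
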
@@ -84,9 +84,9 @@ const STRING: &str = r#"^\"(?<content>.*?(\\)*?)\"(?<caseinvariant>i?)"#;
 const STRING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(STRING).unwrap());
 const NL: &str = r"^(\r?\n)+\s*";
 const NL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(NL).unwrap());
-const REGEXP: &str = r#"^\/(?<pattern>(\\\/|\\\\|[^\/])*?)\/(?<flags>[imslux]*)"#;
+const REGEXP: &str = r#"^\/(?!\/)(?<pattern>(\\\/|\\\\|[^\/])*?)\/(?<flags>[imslux]*)"#;
 const REGEXP_RE: Lazy<Regex> = Lazy::new(|| Regex::new(REGEXP).unwrap());
-const OP: &str = r"^[+\*\?]";
+const OP: &str = r"^[+\*]|[?](?![a-z])";
 const OP_RE: Lazy<Regex> = Lazy::new(|| Regex::new(OP).unwrap());
 const VBAR: &str = r"^((\r?\n)+\s*)?\|";
 const VBAR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(VBAR).unwrap());
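
Both new patterns rely on lookaround, which the regex crate rejects at compile time and fancy-regex accepts: REGEXP now refuses a literal that starts with a second slash via (?!\/), and OP only treats '?' as a quantifier when it is not followed by a lowercase letter. A small check of the OP change, for illustration only and not part of the commit:

    use fancy_regex::Regex;

    fn op_lookahead_demo() {
        let op = Regex::new(r"^[+\*]|[?](?![a-z])").unwrap();
        assert!(op.is_match("?").unwrap()); // bare optional quantifier
        assert!(op.is_match("? foo").unwrap()); // '?' not followed by a lowercase letter
        assert!(!op.is_match("?start").unwrap()); // lark-style '?rule' prefix, not an operator
        // The old pattern r"^[+\*\?]" matched "?start" as well, which is why the
        // parser previously peeked at the following character by hand (see the
        // parse_expression hunk below).
    }
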
@@ -138,9 +138,15 @@ impl EBNFParser {
         // Advance to next non-whitespace (or comment) in the input.
         self.consume_space();
         // item: rule | token
-        if RULE_RE.is_match(&self.input_string[self.cur_pos..]) {
+        if RULE_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
             self.parse_rule();
-        } else if TOKEN_RE.is_match(&self.input_string[self.cur_pos..]) {
+        } else if TOKEN_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
             self.parse_token();
         } else if self.peek(0) == Some('%') {
             self.parse_statement();
@@ -174,8 +180,10 @@ impl EBNFParser {
         // We know there's a match because this method is only called after
         // we've already determined that the next thing is a rule.
         // Remove the leading ? or !, if any, because we don't care about it.
+        self.consume_space();
         let rule_match = RULE_RE
             .captures(&self.input_string[self.cur_pos..])
+            .unwrap()
             .unwrap();
         // eprintln!("{:?}", rule_match);
         self.cur_parsing = Item::RULE;
@@ -210,10 +218,14 @@ impl EBNFParser {
     /// Note that the params that are present in the syntax have already been
     /// expanded by the time this procedure is called.
     fn parse_token(&mut self) {
-        let token_match = TOKEN_RE.find(&self.input_string[self.cur_pos..]).unwrap();
+        self.consume_space();
+        let token_match = TOKEN_RE
+            .find(&self.input_string[self.cur_pos..])
+            .unwrap()
+            .unwrap();
 
         self.name_stack.push(token_match.as_str().into());
-        self.consume(token_match.len());
+        self.consume(token_match.as_str().len());
 
         if self.peek(0) == Some('.') {
             self.parse_priority();
@@ -283,7 +295,7 @@ impl EBNFParser {
             "%ig" => {
                 self.consume("%ignore".len());
                 self.consume_space();
-                let Some(token) = TOKEN_RE.find(&self.input_string[self.cur_pos..]) else {
+                let Ok(Some(token)) = TOKEN_RE.find(&self.input_string[self.cur_pos..]) else {
                     self.report_parse_error("Expected token after %ignore statement.");
                     return;
                 };
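
With find now returning Result<Option<Match>>, the `let Ok(Some(..)) = .. else` form used here sends both an engine error and a plain non-match down the same failure path. A standalone sketch of that behavior; the token pattern below is made up for illustration and is not the crate's TOKEN_RE:

    use fancy_regex::Regex;

    fn first_token_len(input: &str) -> Option<usize> {
        // Hypothetical stand-in for TOKEN_RE, which is defined elsewhere in the file.
        let token_re = Regex::new(r"^[A-Z_][A-Z0-9_]*").unwrap();
        let Ok(Some(token)) = token_re.find(input) else {
            // Err(_) and Ok(None) both land here.
            return None;
        };
        Some(token.as_str().len())
    }
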
@@ -312,7 +324,7 @@ impl EBNFParser {
             self.report_parse_error("Expected '.'.");
         }
 
-        let Some(number_match) = NUMBER_RE.find(&self.input_string[self.cur_pos..]) else {
+        let Ok(Some(number_match)) = NUMBER_RE.find(&self.input_string[self.cur_pos..]) else {
             // Add a return statement to make the compiler happy, even though
             // this call never returns.
             self.report_parse_error("Expected a number.");
@@ -322,7 +334,7 @@ impl EBNFParser {
         // Parsing as an integer *should* never fail, since we've just matched
         // something that must be a valid number.
         self.cur_priority = number_match.as_str().parse::<i32>().unwrap();
-        self.consume(number_match.len());
+        self.consume(number_match.as_str().len());
     }
 
     /// Parse the expansions of the rule.
@@ -348,15 +360,18 @@ impl EBNFParser {
 
             // If there's another alias, parse it.
             // self.consume_space();
-            if VBAR_RE.is_match(&self.input_string[self.cur_pos..]) {
-                // eprintln!("VBAR match");
-                // Advance past the VBAR to the beginning of the next alias.
-                self.consume_space();
-                self.consume(1);
-                continue;
-            } else {
-                // eprintln!("VBAR no match");
-                break self.name_stack.last().unwrap().to_string();
+            match VBAR_RE.is_match(&self.input_string[self.cur_pos..]) {
+                Ok(true) => {
+                    // eprintln!("VBAR match");
+                    // Advance past the VBAR to the beginning of the next alias.
+                    self.consume_space();
+                    self.consume(1);
+                    continue;
+                }
+                _ => {
+                    // eprintln!("VBAR no match");
+                    break self.name_stack.last().unwrap().to_string();
+                }
             }
         }
     }
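
Matching on the Result<bool> with Ok(true) versus a catch-all arm folds Ok(false) and Err(_) together, so a regex engine error is treated like "no | here" rather than being unwrapped. A rough, illustrative equivalent of the decision being made above:

    use fancy_regex::Regex;

    fn starts_with_vbar(rest: &str) -> bool {
        let vbar_re = Regex::new(r"^((\r?\n)+\s*)?\|").unwrap();
        match vbar_re.is_match(rest) {
            Ok(true) => true, // another alias follows
            _ => false,       // Ok(false) or Err(_): stop here
        }
    }
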
@@ -391,7 +406,9 @@ impl EBNFParser {
             self.consume_space();
             res.push(self.parse_expression());
             self.consume_space();
-            if VBAR_RE.is_match(&self.input_string[self.cur_pos..])
+            if VBAR_RE
+                .is_match(&self.input_string[self.cur_pos..])
+                .unwrap()
                 || self.peek(0) == Some(')')
                 || self.peek(0) == Some(']')
                 || (self.peek(0) == Some('-') && self.peek(1) == Some('>'))
@@ -415,73 +432,70 @@ impl EBNFParser {
         let new_atom = self.parse_atom();
         // eprintln!("parse_expression: {}", new_atom);
         // eprintln!("cur_pos: {}", self.cur_pos);
-        if OP_RE.is_match(&self.input_string[self.cur_pos..])
-            && !(self.peek(0) == Some('?')
-                // Use a default value that will evaluate to false to avoid
-                // panicking in the case where the question mark is the last
-                // character of the input.
-                && self.peek(1).unwrap_or('!').is_ascii_lowercase())
-        {
-            // We don't get lookahead in this regex library, but the lark
-            // grammar has the lookahead (?![a-z]) after the question mark
-            // operator, which makes sure we don't treat the question mark at
-            // the beginning of an identifier as a question mark
-            // operator. Instead, we manually implement something like this lookahead.
-            // TOOD: Switch to fancy-regex to support lookaround.
-            // eprintln!("Match on OP_RE");
-            let new_nonterm = self.new_nonterminal("expression");
-            match self.peek(0).unwrap() {
-                '*' => {
-                    // Convert every repetition E* to a fresh non-terminal X
-                    // and add X = $\epsilon$ | X E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec!["".to_string()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_nonterm.clone(), new_atom],
-                    });
-                }
-                '+' => {
-                    // Convert every at-least-one repetition E+ to a fresh
-                    // non-terminal X and add X = E | X E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_atom.clone()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_nonterm.clone(), new_atom.clone()],
-                    });
-                }
-                '?' => {
-                    // Convert every option E? to a fresh non-terminal X and
-                    // add X = $\epsilon$ | E.
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec!["".to_string()],
-                    });
-                    self.grammar.productions.push(Production {
-                        lhs: new_nonterm.clone(),
-                        rhs: vec![new_atom.clone()],
-                    });
+        match OP_RE.is_match(&self.input_string[self.cur_pos..]) {
+            Ok(true) => {
+                // We don't get lookahead in this regex library, but the lark
+                // grammar has the lookahead (?![a-z]) after the question mark
+                // operator, which makes sure we don't treat the question mark at
+                // the beginning of an identifier as a question mark
+                // operator. Instead, we manually implement something like this lookahead.
+                // TOOD: Switch to fancy-regex to support lookaround.
+                // eprintln!("Match on OP_RE");
+                let new_nonterm = self.new_nonterminal("expression");
+                match self.peek(0).unwrap() {
+                    '*' => {
+                        // Convert every repetition E* to a fresh non-terminal X
+                        // and add X = $\epsilon$ | X E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec!["".to_string()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_nonterm.clone(), new_atom],
+                        });
+                    }
+                    '+' => {
+                        // Convert every at-least-one repetition E+ to a fresh
+                        // non-terminal X and add X = E | X E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_atom.clone()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_nonterm.clone(), new_atom.clone()],
+                        });
+                    }
+                    '?' => {
+                        // Convert every option E? to a fresh non-terminal X and
+                        // add X = $\epsilon$ | E.
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec!["".to_string()],
+                        });
+                        self.grammar.productions.push(Production {
+                            lhs: new_nonterm.clone(),
+                            rhs: vec![new_atom.clone()],
+                        });
+                    }
+                    _ => { /* We should never reach this because we already matched above. */ }
                 }
-                _ => { /* We should never reach this because we already matched above. */ }
+                self.consume(1);
+                self.grammar.symbol_set.push(new_nonterm.clone());
+                return new_nonterm;
+                // TODO: Add support for range repeats.
+                // } else if self.peek(0) == Some('~') {
+                // // The following is a range of some kind.
+                // self.consume(1);
+                // self.consume_space();
+                // // self.handle_range();
+            }
+            _ => {
+                // eprintln!("No match on OP_RE");
+                // An unquantified atom.
+                new_atom
             }
-            self.consume(1);
-            self.grammar.symbol_set.push(new_nonterm.clone());
-            return new_nonterm;
-            // TODO: Add support for range repeats.
-            // } else if self.peek(0) == Some('~') {
-            // // The following is a range of some kind.
-            // self.consume(1);
-            // self.consume_space();
-            // // self.handle_range();
-        } else {
-            // eprintln!("No match on OP_RE");
-            // An unquantified atom.
-            new_atom
         }
     }
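
The arm bodies above perform the usual EBNF-to-BNF rewrite: for E* a fresh non-terminal X is minted with X -> ε | X E, E+ gets X -> E | X E, and E? gets X -> ε | E, with ε encoded as the empty string in rhs. A standalone sketch of the E* case; the local Production struct below only mirrors the two fields the commit uses, while the real type lives in crate::types:

    struct Production {
        lhs: String,
        rhs: Vec<String>,
    }

    // E*  =>  X -> ε   and   X -> X E   (ε encoded as the empty string)
    fn star_productions(fresh_nonterm: &str, atom: &str) -> Vec<Production> {
        vec![
            Production {
                lhs: fresh_nonterm.into(),
                rhs: vec!["".into()],
            },
            Production {
                lhs: fresh_nonterm.into(),
                rhs: vec![fresh_nonterm.into(), atom.into()],
            },
        ]
    }
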

@@ -553,7 +567,7 @@ impl EBNFParser {
         let input_string = self.input_string.clone();
         // self.consume_space();
 
-        let Some(matched_string) = STRING_RE.captures(&input_string[self.cur_pos..]) else {
+        let Ok(Some(matched_string)) = STRING_RE.captures(&input_string[self.cur_pos..]) else {
             return self.report_parse_error("String ill-formed.");
         };

@@ -568,7 +582,7 @@ impl EBNFParser {
         if self.peek(0) == Some('.') && self.peek(1) == Some('.') {
             // Literal range!
             self.consume(2);
-            let Some(second_matched_string) = STRING_RE.captures(&input_string[self.cur_pos..])
+            let Ok(Some(second_matched_string)) = STRING_RE.captures(&input_string[self.cur_pos..])
             else {
                 return self.report_parse_error("String ill-formed.");
             };
@@ -614,16 +628,27 @@ impl EBNFParser {
     /// `name: RULE | TOKEN`
     fn parse_name(&mut self) -> String {
         let input_string = self.input_string.clone();
-        if RULE_RE.is_match(&self.input_string[self.cur_pos..]) {
-            let rule_match = RULE_RE.find(&input_string[self.cur_pos..]).unwrap();
-            self.consume(rule_match.len());
+        self.consume_space();
+        if RULE_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
+            let Some(rule_match) = RULE_RE.find(&input_string[self.cur_pos..]).unwrap() else {
+                panic!()
+            };
+            self.consume(rule_match.as_str().len());
             self.grammar
                 .symbol_set
                 .push(rule_match.as_str().to_string());
             return rule_match.as_str().into();
-        } else if TOKEN_RE.is_match(&self.input_string[self.cur_pos..]) {
-            let token_match = TOKEN_RE.find(&input_string[self.cur_pos..]).unwrap();
-            self.consume(token_match.len());
+        } else if TOKEN_RE
+            .is_match(&self.input_string[self.cur_pos..])
+            .unwrap()
+        {
+            let Some(token_match) = TOKEN_RE.find(&input_string[self.cur_pos..]).unwrap() else {
+                panic!()
+            };
+            self.consume(token_match.as_str().len());
             self.grammar
                 .symbol_set
                 .push(token_match.as_str().to_string());
@@ -646,7 +671,7 @@ impl EBNFParser {
     /// grammar and treat them as such.
     fn parse_regex(&mut self) -> String {
         let input_string = self.input_string.clone();
-        let Some(re_match) = REGEXP_RE.captures(&input_string[self.cur_pos..]) else {
+        let Ok(Some(re_match)) = REGEXP_RE.captures(&input_string[self.cur_pos..]) else {
            self.report_parse_error("Failed to parse regular expression.");
            return "".into();
        };
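
Once REGEXP_RE matches, the named groups declared in the pattern (pattern and flags) are reachable through Captures::name. A hypothetical helper, not in the diff, showing how the two pieces could be pulled apart; the function name and behavior are assumptions:

    use fancy_regex::Regex;

    // Hypothetical helper for illustration only.
    fn split_regex_literal(src: &str) -> Option<(String, String)> {
        let re =
            Regex::new(r#"^\/(?!\/)(?<pattern>(\\\/|\\\\|[^\/])*?)\/(?<flags>[imslux]*)"#).ok()?;
        let caps = re.captures(src).ok()??;
        let body = caps.name("pattern")?.as_str().to_string();
        let flags = caps.name("flags").map_or("", |m| m.as_str()).to_string();
        Some((body, flags))
    }
    // e.g. split_regex_literal("/foo/i") == Some(("foo".into(), "i".into()))
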
