Skip to content

Commit aba7205

Browse files
committed
Pass a single integration test.
1 parent 39a1914 commit aba7205

File tree

10 files changed

+137
-82
lines changed

10 files changed

+137
-82
lines changed

examples/construct_mask_store.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
2626
panic!()
2727
};
2828

29-
let Ok(lexer) = Lexer::new(&grammar.terminals) else {
29+
let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
3030
panic!()
3131
};
3232

grammars/calc.lark

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,6 @@ start: expr
1212
| "(" expr ")"
1313

1414

15-
#DIGIT: "0".."9"
16-
#HEXDIGIT: "a".."f"|"A".."F"|DIGIT
17-
#INT: DIGIT+
18-
#SIGNED_INT: ["+"|"-"] INT
19-
#DECIMAL: INT "." INT? | "." INT
20-
#_EXP: ("e"|"E") SIGNED_INT
21-
#FLOAT: INT _EXP | DECIMAL _EXP?
22-
#NUMBER: FLOAT | INT
23-
2415
NUMBER: /(([0-9]+[eE][+-]?[0-9]+)|([0-9]\.[0-9]+|\.[0-9]+)([eE][+-]?[0-9]+)?)|[0-9]+/
2516

2617
WS: /[ \t\f\r\n]+/

src/dfa.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use std::collections::VecDeque;
1515
/// method that returns an iterator over all states. The suggested alternative
1616
/// is to traverse the graph manually. See
1717
/// <https://github.com/rust-lang/regex/discussions/1223>.
18-
fn states(dfa: &dense::DFA<Vec<u32>>) -> Vec<StateID> {
18+
pub fn states(dfa: &dense::DFA<Vec<u32>>) -> Vec<StateID> {
1919
let mut queue: VecDeque<StateID> = VecDeque::new();
2020
let mut explored: Vec<StateID> = Vec::new();
2121

src/grammar.rs

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ impl EBNFParser {
165165
self.grammar.symbol_set.dedup();
166166

167167
// Make ignore terminals ignored, if any. This is a non-trivial task.
168-
self.handle_ignore_terminals();
168+
// self.handle_ignore_terminals();
169169

170170
// Include the special EOF symbol.
171171
self.grammar.symbol_set.push("$".into());
@@ -264,6 +264,10 @@ impl EBNFParser {
264264
self.cur_priority,
265265
));
266266

267+
self.grammar
268+
.symbol_set
269+
.push(self.name_stack.last().unwrap().to_string());
270+
267271
// We are no longer parsing a token.
268272
self.name_stack.pop();
269273
self.cur_priority = 0;
@@ -300,7 +304,9 @@ impl EBNFParser {
300304
self.report_parse_error("Expected token after %ignore statement.");
301305
return;
302306
};
303-
self.ignore_terminals.push(token.as_str().to_string());
307+
self.grammar
308+
.ignore_terminals
309+
.push(token.as_str().to_string());
304310
self.consume(token.as_str().len());
305311
}
306312
"%de" => {
@@ -811,19 +817,23 @@ impl EBNFParser {
811817
/// but that is probably the most common use case.
812818
fn expand_extends(&mut self) {}
813819

814-
/// Insert the ignore terminals into each production.
815-
///
816-
/// We do this instead of passing the ignore terminals on to the lexer so
817-
/// that there aren't multiple kinds of terminals (lexer and parser) that
818-
/// we have to juggle when building a mask: the ignore information is
819-
/// already integrated directly into the grammar itself; the %ignore
820-
/// notation is just syntactic sugar for putting the named terminal at the
821-
/// beginning of each production and after each symbol in each production,
822-
/// except for productions that are already empty (to prevent infinite
823-
/// regression).
824-
fn handle_ignore_terminals(&mut self) {
825-
if !self.ignore_terminals.is_empty() {}
826-
}
820+
// /// Put the ignore terminals into the grammar as terminals.
821+
// ///
822+
// /// We do this at the end of parsing in case the ignore statement comes
823+
// /// before the definition of the terminal it references.
824+
// fn handle_ignore_terminals(&mut self) {
825+
// if !self.ignore_terminals.is_empty() {
826+
// for terminal_name in &self.ignore_terminals {
827+
// let Some(terminal) = self.grammar.terminal_from_name(&terminal_name) else {
828+
// self.report_parse_error(&*format!(
829+
// "The ignore terminal {terminal_name} is not defined"
830+
// ));
831+
// return;
832+
// };
833+
// self.grammar.ignore_terminals.push(terminal);
834+
// }
835+
// }
836+
// }
827837

828838
/// The symbols that "descend" from this one. A symbol descends
829839
/// from a symbol if it is on the right hand side of

src/mask.rs

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -144,30 +144,38 @@ pub fn dfa_mask(
144144
pub fn dfa_mask_store(
145145
lexical_terminals: &Vec<Terminal>,
146146
model_vocabulary: &Vec<Vec<u8>>,
147-
parser: &Parser,
147+
_parser: &Parser,
148148
_length_of_terminal_sequences: usize,
149149
) -> DFAMaskStore {
150150
let all_states = all_dfa_states(lexical_terminals);
151151
let mut store: DFAMaskStore = HashMap::new();
152152
for (terminal, state_id) in &all_states {
153+
let dfa = &terminal.dfa;
153154
// For now, hard-code the lookahead of two terminals.
154155
// let next_terminals = parser.next_terminals(&terminal.name);
156+
// Lookahead of zero terminals.
157+
let accept_sequence_names = vec![];
158+
let accept_sequence_terminals = vec![];
159+
store.insert(
160+
(terminal.name.clone(), *state_id, accept_sequence_names),
161+
dfa_mask(dfa, state_id, &accept_sequence_terminals, &model_vocabulary),
162+
);
155163
for next_terminal in lexical_terminals {
164+
// Lookahead of one terminal.
165+
let accept_sequence_names = vec![next_terminal.name.clone()];
166+
let accept_sequence_terminals = vec![next_terminal.clone()];
167+
store.insert(
168+
(terminal.name.clone(), *state_id, accept_sequence_names),
169+
dfa_mask(dfa, state_id, &accept_sequence_terminals, &model_vocabulary),
170+
);
156171
// let after_next_terminals = parser.next_terminals(next_terminal);
157172
for after_next_terminal in lexical_terminals {
173+
// Lookahead of two terminals.
158174
let accept_sequence_names =
159175
vec![next_terminal.name.clone(), after_next_terminal.name.clone()];
160-
let accept_sequence_terminals = vec![
161-
parser
162-
.grammar
163-
.terminal_from_name(&next_terminal.name)
164-
.unwrap(),
165-
parser
166-
.grammar
167-
.terminal_from_name(&after_next_terminal.name)
168-
.unwrap(),
169-
];
170-
let dfa = &terminal.dfa;
176+
let accept_sequence_terminals =
177+
vec![next_terminal.clone(), after_next_terminal.clone()];
178+
171179
store.insert(
172180
(terminal.name.clone(), *state_id, accept_sequence_names),
173181
dfa_mask(dfa, state_id, &accept_sequence_terminals, &model_vocabulary),
@@ -202,11 +210,18 @@ pub fn grammar_mask(
202210
// Get the relevant mask out of the store.
203211
mask_store
204212
.get(&(
205-
first_terminal.name,
213+
first_terminal.name.clone(),
206214
end_state,
207215
accept_sequence[1..].to_vec(),
208216
))
209-
.unwrap(),
217+
.unwrap_or_else(|| {
218+
panic!(
219+
"The mask store does not contain the key ({}, {:?}, {:#?}).",
220+
first_terminal.name,
221+
end_state,
222+
accept_sequence[1..].to_vec()
223+
)
224+
}),
210225
)
211226
.enumerate()
212227
{

src/parser.rs

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ impl Parser {
2727

2828
/// Return all the terminals that could come after this one, regardless of
2929
/// the state the parser is in.
30+
///
31+
/// FIXME: This only works for shift actions and fails (stack underflow)
32+
/// for reduces. Reimplement this as a cleverer algorithm at the level of
33+
/// the grammar.
3034
pub fn next_terminals(&self, terminal: &String) -> Vec<String> {
3135
let states_that_accept_this_terminal: Vec<usize> = self
3236
.action_table
@@ -73,6 +77,10 @@ impl Parser {
7377
// consolidating the error-managing boiler plate.
7478
let mut state_stack = state_stack;
7579

80+
// if self.grammar.ignore_terminals.contains(terminal) {
81+
// // Consume the terminal without modifying the state stack.
82+
// }
83+
7684
loop {
7785
// Get the current state.
7886
let Some(state) = state_stack.last() else {
@@ -159,10 +167,20 @@ impl Parser {
159167
let mut a0: Vec<String> = Vec::new();
160168
let mut a1: Vec<String> = Vec::new();
161169

162-
let last_token = tokens[tokens.len() - 1].clone();
170+
let last_token = tokens.last().unwrap().clone();
163171
let mut state_stack = vec![self.start_state];
164172

165-
for token in &tokens[..] {
173+
// eprintln!("{:#?}", state_stack);
174+
for token in &tokens {
175+
// Skip ignore terminals.
176+
if self
177+
.grammar
178+
.ignore_terminals
179+
.contains(&token.clone().terminal.unwrap().name)
180+
{
181+
continue;
182+
}
183+
166184
let terminal = token.clone().terminal.unwrap().name.clone();
167185
// FIXME: There must be a less horrid way to do this.
168186
let Ok(new_state_stack) = self.next(&terminal, state_stack) else {
@@ -171,14 +189,15 @@ impl Parser {
171189
state_stack = new_state_stack;
172190
a0 = a1;
173191
a1 = self.follow(&state_stack);
192+
// eprintln!("{:#?}", state_stack);
174193
}
175194

176195
// There are two cases for accept sequences. See section 4.5 of the
177196
// paper and Algorithm 4, lines 15-21.
178197
let mut accept_sequences: HashSet<Vec<String>> = HashSet::new();
179198
if last_token == remainder {
180199
// Case 1: the remainder is the last lexical token.
181-
let remainder_type = remainder.clone().terminal.unwrap().name;
200+
let remainder_type = last_token.clone().terminal.unwrap().name;
182201
for terminal in a1 {
183202
accept_sequences.insert(vec![remainder_type.clone(), terminal]);
184203
}
@@ -191,6 +210,13 @@ impl Parser {
191210
accept_sequences.insert(vec![terminal]);
192211
}
193212
}
213+
214+
for ignore_terminal in &self.grammar.ignore_terminals {
215+
// We deal with ignore terminals by adding a special 1-long accept
216+
// sequence for each ignore terminal.
217+
accept_sequences.insert(vec![ignore_terminal.to_string()]);
218+
}
219+
194220
Ok(accept_sequences)
195221
}
196222
}
@@ -438,8 +464,6 @@ mod tests {
438464
action_table,
439465
goto_table,
440466
start_state: 0,
441-
// FIXME: Make this actually what it's supposed to be; for now,
442-
// just make the compiler happy.
443467
grammar: Grammar {
444468
symbol_set: vec![
445469
"goal".into(),
@@ -451,11 +475,12 @@ mod tests {
451475
"DEC_NUMBER".into(),
452476
"WORD".into(),
453477
"$".into(),
478+
"SPACE".into(),
454479
],
455-
terminals: vec![plus(), star(), dec_number(), word(), eof()],
480+
terminals: vec![plus(), star(), dec_number(), word(), eof(), space()],
456481
start_symbol: "goal".to_string(),
457482
productions: calc_rules(),
458-
ignore_terminals: vec![],
483+
ignore_terminals: vec!["SPACE".into()],
459484
},
460485
}
461486
}
@@ -502,7 +527,8 @@ mod tests {
502527
fn end_to_end_parse() {
503528
let parser = calc_parser();
504529
let terminals = vec![word(), star(), dec_number(), plus(), space()];
505-
let Ok(lexer) = Lexer::new(&terminals) else {
530+
let ignore_terminals = vec!["SPACE".to_string()];
531+
let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
506532
panic!()
507533
};
508534

@@ -529,18 +555,11 @@ mod tests {
529555
},
530556
remainder
531557
);
532-
// It's not clear to me exactly what the semantics are here
533-
// w.r.t. whitespace and other ignored terminals. Also, how do we deal
534-
// with the possibilty that the remainder could change lexical type,
535-
// even to types that aren't permitted? As it is, the algorithm allows
536-
// a DEC_NUMBER to change to a WORD, even though that isn't actually
537-
// possible in the grammar: semantically, what this says is that
538-
// instead of being a DEC_NUMBER, the last lexical token could have
539-
// been a WORD, which is technically true. Nevertheless, we already
540-
// have enough information to know that the last token couldn't
541-
// possibly become a WORD and could only continue to be a DEC_NUMBER.
542558
assert_eq!(
543559
HashSet::from([
560+
vec!["DEC_NUMBER".to_string()],
561+
vec!["WORD".to_string()],
562+
vec!["SPACE".to_string()],
544563
vec!["DEC_NUMBER".to_string(), "STAR".to_string()],
545564
vec!["DEC_NUMBER".to_string(), "PLUS".to_string()],
546565
vec!["DEC_NUMBER".to_string(), "$".to_string()],

src/table.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,12 +342,12 @@ fn checked_insert(state_id: usize, terminal: String, action: Action, table: &mut
342342
// As a special case, if the action we were going to insert is the same
343343
// as the one that's already there, we give a warning and continue.
344344
if table.get(&(state_id, terminal.clone())) == Some(&action) {
345-
eprintln!(
346-
"WARNING: While inserting an action into the action table, we found the same action we were going to insert.\nstate_id: {state_id}, terminal: {:#?}, existing action: {:#?}, new action: {:#?}",
347-
terminal.clone(),
348-
table.get(&(state_id, terminal.clone())),
349-
action
350-
);
345+
// eprintln!(
346+
// "WARNING: While inserting an action into the action table, we found the same action we were going to insert.\nstate_id: {state_id}, terminal: {:#?}, existing action: {:#?}, new action: {:#?}",
347+
// terminal.clone(),
348+
// table.get(&(state_id, terminal.clone())),
349+
// action
350+
// );
351351
} else {
352352
panic!(
353353
"While inserting an action into the action table, discovered a conflicting action:\nstate_id: {state_id}, terminal: {:#?}, existing action: {:#?}, new action: {:#?}",

src/types.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ pub struct Grammar {
9090
/// The terminals that the lexer should ignore. FIXME: It's an aberration
9191
/// that this is here, because this field is not part of the abstract idea
9292
/// of what a grammar is.
93-
pub ignore_terminals: Vec<Terminal>,
93+
pub ignore_terminals: Vec<String>,
9494
}
9595

9696
/// An item of the item set for LR parsing.
@@ -145,6 +145,8 @@ pub struct Parser {
145145
pub struct Lexer {
146146
/// The terminals this lexer recognizes.
147147
pub terminals: Vec<Terminal>,
148+
/// The terminals this lexer ignores.
149+
pub ignore_terminals: Vec<String>,
148150
/// The terminals that contain newlines.
149151
pub newline_types: HashSet<Terminal>,
150152
/// The DFA for matching patterns.

tests/generation.rs

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
use bstr::ByteSlice;
2+
use regex_automata::dfa::Automaton;
3+
use syncode_core::dfa::{all_dfa_states, states};
24
// tests/generation.rs
35
use syncode_core::grammar::EBNFParser;
46
use syncode_core::mask::{dfa_mask_store, grammar_mask};
@@ -29,27 +31,34 @@ fn test_calc() {
2931
};
3032

3133
// eprintln!("action_table: {:#?}", parser.action_table);
34+
// eprintln!("goto_table: {:#?}", parser.goto_table);
3235

33-
let Ok(lexer) = Lexer::new(&grammar.terminals) else {
36+
let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
3437
panic!()
3538
};
3639

3740
let mask_store = dfa_mask_store(&grammar.terminals, &model_vocabulary, &parser, 2);
3841

39-
let prefix = b"1+1";
42+
let prefix = b"1 + ";
4043

4144
let Ok((tokens, remainder)) = lexer.lex(prefix) else {
4245
panic!()
4346
};
4447

45-
// eprintln!("remainder: {}", remainder.value.as_bstr());
46-
// eprintln!("tokens: {:#?}", tokens);
48+
eprintln!("remainder: {:#?}", remainder);
49+
eprintln!("tokens: {:#?}", tokens);
4750

4851
let Ok(accept_sequences) = parser.parse(tokens, remainder.clone()) else {
4952
panic!()
5053
};
5154

52-
// eprintln!("{:#?}", accept_sequences);
55+
eprintln!("accept_sequences: {:#?}", accept_sequences);
56+
57+
// let number = grammar.terminal_from_name(&"NUMBER".to_string()).unwrap();
58+
59+
// let state = number.advance(number.start_state(), b"22");
60+
61+
// eprintln!("{:?}", states(&number.dfa));
5362

5463
let mask = grammar_mask(
5564
&accept_sequences,
@@ -104,7 +113,7 @@ fn test_dfa_mask_store() {
104113
panic!()
105114
};
106115
// println!("{:#?}", parser);
107-
let Ok(lexer) = Lexer::new(&grammar.terminals) else {
116+
let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
108117
panic!()
109118
};
110119
let store = dfa_mask_store(&lexical_terminals, &model_vocabulary, &parser, 2);

0 commit comments

Comments
 (0)