Commit 39a1914

Reintroduce ignore terminals to the lexer.
These are surprisingly slippery little buggers.
1 parent: b6b1670

1 file changed: src/lexer.rs (+73 additions, -41 deletions)
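In short, `Lexer::new` now takes the names of the ignore terminals alongside the terminal definitions; any matched token whose terminal name is on that list is consumed, with position tracking still advancing, but is never emitted. A minimal usage sketch, assuming the `word()`/`space()` test helpers and the `SPACE` terminal name that appear in the tests in this diff:

    let terminals = vec![word(), space()];
    let ignore_terminals = vec!["SPACE".to_string()];

    let lexer = Lexer::new(&terminals, &ignore_terminals).unwrap();
    let (tokens, _remainder) = lexer.lex("hello world".as_bytes()).unwrap();

    // The space between the words is matched and skipped, so only the
    // two WORD tokens come back.
    assert_eq!(tokens.len(), 2);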
@@ -30,7 +30,10 @@ pub enum LexError {
 impl Lexer {
     /// Construct a new lexer that recognizes the given `terminals` and ignores
     /// the `ignore_types`.
-    pub fn new(terminals: &Vec<Terminal>) -> Result<Self, LexError> {
+    pub fn new(
+        terminals: &Vec<Terminal>,
+        ignore_terminals: &Vec<String>,
+    ) -> Result<Self, LexError> {
         // Determine which patterns might contain newlines
         let mut newline_types: HashSet<Terminal> = HashSet::new();
         let mut index_to_type: HashMap<usize, Terminal> = HashMap::new();
@@ -79,6 +82,7 @@ impl Lexer {
 
         Ok(Lexer {
             terminals: terminals.to_vec(),
+            ignore_terminals: ignore_terminals.to_vec(),
             newline_types,
             dfa,
             index_to_type,
@@ -174,6 +178,32 @@ impl Lexer {
         loop {
             // Try to match next token
             if let Some((value, terminal)) = self.match_token(text.into(), pos) {
+                let ignored = self.ignore_terminals.contains(&terminal.name);
+
+                // If this token is ignored, update position and continue the loop
+                if ignored {
+                    let contains_newline = self.newline_types.contains(terminal);
+
+                    // Update line and column information
+                    if contains_newline {
+                        // Calculate new line and column for tokens with newlines
+                        for &b in &value {
+                            if b == b'\n' {
+                                line += 1;
+                                column = 1;
+                            } else {
+                                column += 1;
+                            }
+                        }
+                    } else {
+                        column += value.len();
+                    }
+
+                    // Move position forward and continue the loop
+                    pos += value.len();
+                    continue;
+                }
+
                 // For non-ignored tokens, create and return the token
                 let start_pos = pos;
                 let end_pos = start_pos + value.len();
@@ -371,7 +401,7 @@ mod tests {
     fn lexer_initialization() {
         let terminal_defs = vec![word(), space()];
 
-        let Ok(lexer) = Lexer::new(&terminal_defs) else {
+        let Ok(lexer) = Lexer::new(&terminal_defs, &vec![]) else {
             panic!()
         };
 
@@ -384,31 +414,31 @@ mod tests {
         let terminal_defs = vec![word(), space()];
 
         // Initialize the lexer
-        let Ok(lexer) = Lexer::new(&terminal_defs) else {
+        let Ok(lexer) = Lexer::new(&terminal_defs, &vec!["SPACE".to_string()]) else {
             panic!()
         };
 
         // Lex a simple text
         let tokens = lexer.lex("hello world".as_bytes()).unwrap();
 
-        // Should have 3 tokens: "hello", " ", and "world"
-        // (plus one EOF marker)
-        assert_eq!(tokens.0.len(), 3);
+        // Should have 2 tokens: "hello", and "world".
+        assert_eq!(tokens.0.len(), 2);
 
         assert_eq!(&*tokens.0[0].value, "hello".as_bytes());
         assert_eq!(tokens.0[0].terminal, Some(word()));
 
-        assert_eq!(&*tokens.0[2].value, "world".as_bytes());
-        assert_eq!(tokens.0[2].terminal, Some(word()));
+        assert_eq!(&*tokens.0[1].value, "world".as_bytes());
+        assert_eq!(tokens.0[1].terminal, Some(word()));
 
         // The remainder should be the last token in the input.
-        assert_eq!(tokens.0[2], tokens.1);
+        assert_eq!(tokens.0[1], tokens.1);
     }
 
     #[test]
     fn expression() {
         let terminals = vec![word(), star(), dec_number(), plus(), space()];
-        let Ok(lexer) = Lexer::new(&terminals) else {
+        let ignore_terminals = vec!["SPACE".to_string()];
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 
@@ -434,9 +464,10 @@ mod tests {
 
     #[test]
     fn complex_string_literals() {
-        let terminal_defs = vec![string(), word(), equals(), dot(), space()];
+        let terminals = vec![string(), word(), equals(), dot(), space()];
+        let ignore_terminals = vec!["SPACE".to_string()];
 
-        let Ok(lexer) = Lexer::new(&terminal_defs) else {
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 
@@ -451,15 +482,12 @@ mod tests {
             .map(|token| token.terminal.clone().unwrap())
             .collect();
 
-        assert_eq!(
-            token_types,
-            vec![word(), space(), equals(), space(), string(), dot()]
-        );
+        assert_eq!(token_types, vec![word(), equals(), string(), dot()]);
     }
 
     #[test]
     fn numeric_literals() {
-        let terminal_defs = vec![
+        let terminals = vec![
             float_number(),
             hex_number(),
             oct_number(),
@@ -470,16 +498,17 @@ mod tests {
             semicolon(),
             space(),
         ];
+        let ignore_terminals = vec!["SPACE".to_string()];
 
         // Test cases for numeric literals
         let test_cases = vec![
             (
                 "x = 42;",
                 vec![
                     (word(), "x".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (dec_number(), "42".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -488,9 +517,9 @@ mod tests {
                 "hex = 0xFF;",
                 vec![
                     (word(), "hex".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (hex_number(), "0xFF".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -499,9 +528,9 @@ mod tests {
                 "oct = 0o77;",
                 vec![
                     (word(), "oct".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (oct_number(), "0o77".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -510,9 +539,9 @@ mod tests {
                 "bin = 0b1010;",
                 vec![
                     (word(), "bin".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (bin_number(), "0b1010".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -521,9 +550,9 @@ mod tests {
                 "pi = 3.14159;",
                 vec![
                     (word(), "pi".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (float_number(), "3.14159".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -532,9 +561,9 @@ mod tests {
                 "e = 2.71e-3;",
                 vec![
                     (word(), "e".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (float_number(), "2.71e-3".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -543,9 +572,9 @@ mod tests {
                 "val = .5;",
                 vec![
                     (word(), "val".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (float_number(), ".5".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
@@ -554,16 +583,16 @@ mod tests {
                 "sci = 6.022e23;",
                 vec![
                     (word(), "sci".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (equals(), "=".as_bytes()),
-                    (space(), " ".as_bytes()),
+                    // (space(), " ".as_bytes()),
                     (float_number(), "6.022e23".as_bytes()),
                     (semicolon(), ";".as_bytes()),
                 ],
             ),
         ];
 
-        let Ok(lexer) = Lexer::new(&terminal_defs) else {
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 
@@ -588,16 +617,17 @@ mod tests {
         // lexical terminal (because that could change its type with future
        // additions).
         let terminals = vec![word(), dec_number(), space()];
+        let ignore_terminals = vec!["SPACE".to_string()];
 
-        let Ok(lexer) = Lexer::new(&terminals) else {
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 
         let text = "123 ret".as_bytes();
         let (tokens, remainder) = lexer.lex(text).unwrap();
 
         // We expect:
-        // tokens: [123, space, ret]
+        // tokens: [123, ret]
         // remainder: ret
         assert_eq!(
             tokens[0],
@@ -614,7 +644,7 @@ mod tests {
         );
 
         assert_eq!(
-            tokens[2],
+            tokens[1],
             Token {
                 value: "ret".as_bytes().into(),
                 terminal: Some(word()),
@@ -627,16 +657,17 @@ mod tests {
             }
         );
 
-        assert_eq!(tokens[2], remainder);
+        assert_eq!(tokens[1], remainder);
     }
 
     #[test]
     fn remainder_is_not_lexical_token() {
         // In the case where the string could not be lexed all the way to the
         // end, the remainder is unlexed suffix.
         let terminals = vec![word(), hex_number(), space()];
+        let ignore_terminals = vec!["SPACE".to_string()];
 
-        let Ok(lexer) = Lexer::new(&terminals) else {
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 
@@ -677,9 +708,10 @@ mod tests {
 
     #[test]
     fn multiline_tracking() {
-        let terminal_defs = vec![word(), newline(), space()];
+        let terminals = vec![word(), newline(), space()];
+        let ignore_terminals = vec!["SPACE".to_string()];
 
-        let Ok(lexer) = Lexer::new(&terminal_defs) else {
+        let Ok(lexer) = Lexer::new(&terminals, &ignore_terminals) else {
             panic!()
         };
 