@@ -30,7 +30,10 @@ pub enum LexError {
3030impl Lexer {
3131 /// Construct a new lexer that recognizes the given `terminals` and ignores
3232 /// tokens whose terminal names appear in `ignore_terminals`.
33- pub fn new ( terminals : & Vec < Terminal > ) -> Result < Self , LexError > {
33+ pub fn new (
34+ terminals : & Vec < Terminal > ,
35+ ignore_terminals : & Vec < String > ,
36+ ) -> Result < Self , LexError > {
3437 // Determine which patterns might contain newlines
3538 let mut newline_types: HashSet < Terminal > = HashSet :: new ( ) ;
3639 let mut index_to_type: HashMap < usize , Terminal > = HashMap :: new ( ) ;
@@ -79,6 +82,7 @@ impl Lexer {
7982
8083 Ok ( Lexer {
8184 terminals : terminals. to_vec ( ) ,
85+ ignore_terminals : ignore_terminals. to_vec ( ) ,
8286 newline_types,
8387 dfa,
8488 index_to_type,
@@ -174,6 +178,32 @@ impl Lexer {
174178 loop {
175179 // Try to match next token
176180 if let Some ( ( value, terminal) ) = self . match_token ( text. into ( ) , pos) {
181+ let ignored = self . ignore_terminals . contains ( & terminal. name ) ;
182+
183+ // If this token is ignored, update position and continue the loop
184+ if ignored {
185+ let contains_newline = self . newline_types . contains ( terminal) ;
186+
187+ // Update line and column information
188+ if contains_newline {
189+ // Calculate new line and column for tokens with newlines
190+ for & b in & value {
191+ if b == b'\n' {
192+ line += 1 ;
193+ column = 1 ;
194+ } else {
195+ column += 1 ;
196+ }
197+ }
198+ } else {
199+ column += value. len ( ) ;
200+ }
201+
202+ // Move position forward and continue the loop
203+ pos += value. len ( ) ;
204+ continue ;
205+ }
206+
177207 // For non-ignored tokens, create and return the token
178208 let start_pos = pos;
179209 let end_pos = start_pos + value. len ( ) ;
@@ -371,7 +401,7 @@ mod tests {
371401 fn lexer_initialization ( ) {
372402 let terminal_defs = vec ! [ word( ) , space( ) ] ;
373403
374- let Ok ( lexer) = Lexer :: new ( & terminal_defs) else {
404+ let Ok ( lexer) = Lexer :: new ( & terminal_defs, & vec ! [ ] ) else {
375405 panic ! ( )
376406 } ;
377407
@@ -384,31 +414,31 @@ mod tests {
384414 let terminal_defs = vec ! [ word( ) , space( ) ] ;
385415
386416 // Initialize the lexer
387- let Ok ( lexer) = Lexer :: new ( & terminal_defs) else {
417+ let Ok ( lexer) = Lexer :: new ( & terminal_defs, & vec ! [ "SPACE" . to_string ( ) ] ) else {
388418 panic ! ( )
389419 } ;
390420
391421 // Lex a simple text
392422 let tokens = lexer. lex ( "hello world" . as_bytes ( ) ) . unwrap ( ) ;
393423
394- // Should have 3 tokens: "hello", " ", and "world"
395- // (plus one EOF marker)
396- assert_eq ! ( tokens. 0 . len( ) , 3 ) ;
424+ // Should have 2 tokens: "hello", and "world".
425+ assert_eq ! ( tokens. 0 . len( ) , 2 ) ;
397426
398427 assert_eq ! ( & * tokens. 0 [ 0 ] . value, "hello" . as_bytes( ) ) ;
399428 assert_eq ! ( tokens. 0 [ 0 ] . terminal, Some ( word( ) ) ) ;
400429
401- assert_eq ! ( & * tokens. 0 [ 2 ] . value, "world" . as_bytes( ) ) ;
402- assert_eq ! ( tokens. 0 [ 2 ] . terminal, Some ( word( ) ) ) ;
430+ assert_eq ! ( & * tokens. 0 [ 1 ] . value, "world" . as_bytes( ) ) ;
431+ assert_eq ! ( tokens. 0 [ 1 ] . terminal, Some ( word( ) ) ) ;
403432
404433 // The remainder should be the last token in the input.
405- assert_eq ! ( tokens. 0 [ 2 ] , tokens. 1 ) ;
434+ assert_eq ! ( tokens. 0 [ 1 ] , tokens. 1 ) ;
406435 }
407436
408437 #[ test]
409438 fn expression ( ) {
410439 let terminals = vec ! [ word( ) , star( ) , dec_number( ) , plus( ) , space( ) ] ;
411- let Ok ( lexer) = Lexer :: new ( & terminals) else {
440+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
441+ let Ok ( lexer) = Lexer :: new ( & terminals, & ignore_terminals) else {
412442 panic ! ( )
413443 } ;
414444
@@ -434,9 +464,10 @@ mod tests {
434464
435465 #[ test]
436466 fn complex_string_literals ( ) {
437- let terminal_defs = vec ! [ string( ) , word( ) , equals( ) , dot( ) , space( ) ] ;
467+ let terminals = vec ! [ string( ) , word( ) , equals( ) , dot( ) , space( ) ] ;
468+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
438469
439- let Ok ( lexer) = Lexer :: new ( & terminal_defs ) else {
470+ let Ok ( lexer) = Lexer :: new ( & terminals , & ignore_terminals ) else {
440471 panic ! ( )
441472 } ;
442473
@@ -451,15 +482,12 @@ mod tests {
451482 . map ( |token| token. terminal . clone ( ) . unwrap ( ) )
452483 . collect ( ) ;
453484
454- assert_eq ! (
455- token_types,
456- vec![ word( ) , space( ) , equals( ) , space( ) , string( ) , dot( ) ]
457- ) ;
485+ assert_eq ! ( token_types, vec![ word( ) , equals( ) , string( ) , dot( ) ] ) ;
458486 }
459487
460488 #[ test]
461489 fn numeric_literals ( ) {
462- let terminal_defs = vec ! [
490+ let terminals = vec ! [
463491 float_number( ) ,
464492 hex_number( ) ,
465493 oct_number( ) ,
@@ -470,16 +498,17 @@ mod tests {
470498 semicolon( ) ,
471499 space( ) ,
472500 ] ;
501+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
473502
474503 // Test cases for numeric literals
475504 let test_cases = vec ! [
476505 (
477506 "x = 42;" ,
478507 vec![
479508 ( word( ) , "x" . as_bytes( ) ) ,
480- ( space( ) , " " . as_bytes( ) ) ,
509+ // (space(), " ".as_bytes()),
481510 ( equals( ) , "=" . as_bytes( ) ) ,
482- ( space( ) , " " . as_bytes( ) ) ,
511+ // (space(), " ".as_bytes()),
483512 ( dec_number( ) , "42" . as_bytes( ) ) ,
484513 ( semicolon( ) , ";" . as_bytes( ) ) ,
485514 ] ,
@@ -488,9 +517,9 @@ mod tests {
488517 "hex = 0xFF;" ,
489518 vec![
490519 ( word( ) , "hex" . as_bytes( ) ) ,
491- ( space( ) , " " . as_bytes( ) ) ,
520+ // (space(), " ".as_bytes()),
492521 ( equals( ) , "=" . as_bytes( ) ) ,
493- ( space( ) , " " . as_bytes( ) ) ,
522+ // (space(), " ".as_bytes()),
494523 ( hex_number( ) , "0xFF" . as_bytes( ) ) ,
495524 ( semicolon( ) , ";" . as_bytes( ) ) ,
496525 ] ,
@@ -499,9 +528,9 @@ mod tests {
499528 "oct = 0o77;" ,
500529 vec![
501530 ( word( ) , "oct" . as_bytes( ) ) ,
502- ( space( ) , " " . as_bytes( ) ) ,
531+ // (space(), " ".as_bytes()),
503532 ( equals( ) , "=" . as_bytes( ) ) ,
504- ( space( ) , " " . as_bytes( ) ) ,
533+ // (space(), " ".as_bytes()),
505534 ( oct_number( ) , "0o77" . as_bytes( ) ) ,
506535 ( semicolon( ) , ";" . as_bytes( ) ) ,
507536 ] ,
@@ -510,9 +539,9 @@ mod tests {
510539 "bin = 0b1010;" ,
511540 vec![
512541 ( word( ) , "bin" . as_bytes( ) ) ,
513- ( space( ) , " " . as_bytes( ) ) ,
542+ // (space(), " ".as_bytes()),
514543 ( equals( ) , "=" . as_bytes( ) ) ,
515- ( space( ) , " " . as_bytes( ) ) ,
544+ // (space(), " ".as_bytes()),
516545 ( bin_number( ) , "0b1010" . as_bytes( ) ) ,
517546 ( semicolon( ) , ";" . as_bytes( ) ) ,
518547 ] ,
@@ -521,9 +550,9 @@ mod tests {
521550 "pi = 3.14159;" ,
522551 vec![
523552 ( word( ) , "pi" . as_bytes( ) ) ,
524- ( space( ) , " " . as_bytes( ) ) ,
553+ // (space(), " ".as_bytes()),
525554 ( equals( ) , "=" . as_bytes( ) ) ,
526- ( space( ) , " " . as_bytes( ) ) ,
555+ // (space(), " ".as_bytes()),
527556 ( float_number( ) , "3.14159" . as_bytes( ) ) ,
528557 ( semicolon( ) , ";" . as_bytes( ) ) ,
529558 ] ,
@@ -532,9 +561,9 @@ mod tests {
532561 "e = 2.71e-3;" ,
533562 vec![
534563 ( word( ) , "e" . as_bytes( ) ) ,
535- ( space( ) , " " . as_bytes( ) ) ,
564+ // (space(), " ".as_bytes()),
536565 ( equals( ) , "=" . as_bytes( ) ) ,
537- ( space( ) , " " . as_bytes( ) ) ,
566+ // (space(), " ".as_bytes()),
538567 ( float_number( ) , "2.71e-3" . as_bytes( ) ) ,
539568 ( semicolon( ) , ";" . as_bytes( ) ) ,
540569 ] ,
@@ -543,9 +572,9 @@ mod tests {
543572 "val = .5;" ,
544573 vec![
545574 ( word( ) , "val" . as_bytes( ) ) ,
546- ( space( ) , " " . as_bytes( ) ) ,
575+ // (space(), " ".as_bytes()),
547576 ( equals( ) , "=" . as_bytes( ) ) ,
548- ( space( ) , " " . as_bytes( ) ) ,
577+ // (space(), " ".as_bytes()),
549578 ( float_number( ) , ".5" . as_bytes( ) ) ,
550579 ( semicolon( ) , ";" . as_bytes( ) ) ,
551580 ] ,
@@ -554,16 +583,16 @@ mod tests {
554583 "sci = 6.022e23;" ,
555584 vec![
556585 ( word( ) , "sci" . as_bytes( ) ) ,
557- ( space( ) , " " . as_bytes( ) ) ,
586+ // (space(), " ".as_bytes()),
558587 ( equals( ) , "=" . as_bytes( ) ) ,
559- ( space( ) , " " . as_bytes( ) ) ,
588+ // (space(), " ".as_bytes()),
560589 ( float_number( ) , "6.022e23" . as_bytes( ) ) ,
561590 ( semicolon( ) , ";" . as_bytes( ) ) ,
562591 ] ,
563592 ) ,
564593 ] ;
565594
566- let Ok ( lexer) = Lexer :: new ( & terminal_defs ) else {
595+ let Ok ( lexer) = Lexer :: new ( & terminals , & ignore_terminals ) else {
567596 panic ! ( )
568597 } ;
569598
@@ -588,16 +617,17 @@ mod tests {
588617 // lexical terminal (because that could change its type with future
589618 // additions).
590619 let terminals = vec ! [ word( ) , dec_number( ) , space( ) ] ;
620+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
591621
592- let Ok ( lexer) = Lexer :: new ( & terminals) else {
622+ let Ok ( lexer) = Lexer :: new ( & terminals, & ignore_terminals ) else {
593623 panic ! ( )
594624 } ;
595625
596626 let text = "123 ret" . as_bytes ( ) ;
597627 let ( tokens, remainder) = lexer. lex ( text) . unwrap ( ) ;
598628
599629 // We expect:
600- // tokens: [123, space, ret]
630+ // tokens: [123, ret]
601631 // remainder: ret
602632 assert_eq ! (
603633 tokens[ 0 ] ,
@@ -614,7 +644,7 @@ mod tests {
614644 ) ;
615645
616646 assert_eq ! (
617- tokens[ 2 ] ,
647+ tokens[ 1 ] ,
618648 Token {
619649 value: "ret" . as_bytes( ) . into( ) ,
620650 terminal: Some ( word( ) ) ,
@@ -627,16 +657,17 @@ mod tests {
627657 }
628658 ) ;
629659
630- assert_eq ! ( tokens[ 2 ] , remainder) ;
660+ assert_eq ! ( tokens[ 1 ] , remainder) ;
631661 }
632662
633663 #[ test]
634664 fn remainder_is_not_lexical_token ( ) {
635665 // In the case where the string could not be lexed all the way to the
636666 // end, the remainder is unlexed suffix.
637667 let terminals = vec ! [ word( ) , hex_number( ) , space( ) ] ;
668+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
638669
639- let Ok ( lexer) = Lexer :: new ( & terminals) else {
670+ let Ok ( lexer) = Lexer :: new ( & terminals, & ignore_terminals ) else {
640671 panic ! ( )
641672 } ;
642673
@@ -677,9 +708,10 @@ mod tests {
677708
678709 #[ test]
679710 fn multiline_tracking ( ) {
680- let terminal_defs = vec ! [ word( ) , newline( ) , space( ) ] ;
711+ let terminals = vec ! [ word( ) , newline( ) , space( ) ] ;
712+ let ignore_terminals = vec ! [ "SPACE" . to_string( ) ] ;
681713
682- let Ok ( lexer) = Lexer :: new ( & terminal_defs ) else {
714+ let Ok ( lexer) = Lexer :: new ( & terminals , & ignore_terminals ) else {
683715 panic ! ( )
684716 } ;
685717
0 commit comments