From 987a74171e1759a96ae7e7ed2cf9df63c00d16b5 Mon Sep 17 00:00:00 2001 From: Keith Cirkel Date: Sun, 4 Feb 2024 13:19:55 +0000 Subject: [PATCH] embed trivia skipping in lexer --- Cargo.lock | 11 +++++ Cargo.toml | 1 + crates/hdx_lexer/Cargo.toml | 1 + crates/hdx_lexer/src/lib.rs | 30 +++++++++++++- crates/hdx_lexer/src/private.rs | 41 +++++++++++++------ crates/hdx_lexer/tests/tests.rs | 71 ++++++++++++++++++++++----------- 6 files changed, 117 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef5e1aa4..02e0b69c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,16 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +[[package]] +name = "bitmask-enum" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9990737a6d5740ff51cdbbc0f0503015cb30c390f6623968281eb214a520cfc0" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "bstr" version = "1.6.0" @@ -607,6 +617,7 @@ dependencies = [ name = "hdx_lexer" version = "0.0.1" dependencies = [ + "bitmask-enum", "bumpalo", "hdx_atom", "hdx_syntax", diff --git a/Cargo.toml b/Cargo.toml index 209ce215..02847a17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ serde = { version = "1.0.171" } serde_json = { version = "1.0.102" } syn = { version = "2.0.26" } quote = { version = "1.0.31" } +bitmask-enum = { version = "2.2.1" } glob = { version = "0.3.1" } pico-args = { version = "0.5.0" } diff --git a/crates/hdx_lexer/Cargo.toml b/crates/hdx_lexer/Cargo.toml index bbe23943..bc974819 100644 --- a/crates/hdx_lexer/Cargo.toml +++ b/crates/hdx_lexer/Cargo.toml @@ -16,6 +16,7 @@ hdx_atom.workspace = true # Use OXC Allocator until https://github.com/fitzgen/bumpalo/pull/210 is resolved oxc_allocator = { workspace = true } bumpalo = { workspace = true, features = ["boxed", "collections"] } +bitmask-enum = { workspace = true } miette = { 
workspace = true } serde = { workspace = true, features = ["derive"], optional = true } diff --git a/crates/hdx_lexer/src/lib.rs b/crates/hdx_lexer/src/lib.rs index 8d8e4ddc..dd7d8de1 100644 --- a/crates/hdx_lexer/src/lib.rs +++ b/crates/hdx_lexer/src/lib.rs @@ -5,6 +5,7 @@ mod token; use std::{collections::VecDeque, str::Chars}; +use bitmask_enum::bitmask; use oxc_allocator::Allocator; pub use token::{NumType, PairWise, Token}; @@ -15,18 +16,31 @@ pub struct LexerCheckpoint<'a> { prev_pos: u32, } +#[bitmask(u8)] +pub(crate) enum Include { + Whitespace = 0b0001, + Comments = 0b0010, +} + pub struct Lexer<'a> { allocator: &'a Allocator, source: &'a str, current: LexerCheckpoint<'a>, lookahead: VecDeque<LexerCheckpoint<'a>>, + include: Include, } impl<'a> Lexer<'a> { pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { let token = Token::default(); let current = LexerCheckpoint { chars: source.chars(), token, prev_pos: 0 }; - Self { allocator, source, current, lookahead: VecDeque::with_capacity(4) } + Self { + allocator, + source, + current, + lookahead: VecDeque::with_capacity(4), + include: Include::none(), + } } /// Remaining string from `Chars` @@ -108,4 +122,18 @@ impl<'a> Lexer<'a> { } self.read_next_token() } + + pub fn next_including_whitespace(&mut self) -> Token { + self.include = Include::Whitespace; + let token = self.read_next_token(); + self.include = Include::none(); + token + } + + pub fn next_including_whitespace_and_comments(&mut self) -> Token { + self.include = Include::all(); + let token = self.read_next_token(); + self.include = Include::none(); + token + } } diff --git a/crates/hdx_lexer/src/private.rs b/crates/hdx_lexer/src/private.rs index cd068a84..17fd318a 100644 --- a/crates/hdx_lexer/src/private.rs +++ b/crates/hdx_lexer/src/private.rs @@ -10,10 +10,20 @@ use crate::{ constants::{SINGLE_CHAR_TOKENS, SURROGATE_RANGE}, string_builder::AutoCow, token::{NumType, Token}, - Lexer, + Include, Lexer, }; impl<'a> Lexer<'a> { + #[inline] + fn 
include_whitespace(&self) -> bool { + self.include & Include::Whitespace == Include::Whitespace + } + + #[inline] + fn include_comments(&self) -> bool { + self.include & Include::Comments == Include::Comments + } + #[inline] fn nth(&self, n: usize) -> char { self.current.chars.clone().nth(n).unwrap_or(EOF) @@ -41,7 +51,13 @@ impl<'a> Lexer<'a> { } match c { // Whitespace Range - c if is_whitespace(c) => self.consume_whitespace(), + c if is_whitespace(c) => { + self.consume_whitespace(); + if self.include_whitespace() { + return Token::Whitespace; + } + self.read_next_token() + } // Quote Range c if is_quote(c) => self.consume_string_token(), // Digit Range @@ -110,7 +126,11 @@ impl<'a> Lexer<'a> { '*' => { self.current.chars.next(); self.current.chars.next(); - self.consume_comment_token() + self.consume_comment(); + if self.include_comments() { + return Token::Comment; + } + self.read_next_token() } _ => Token::Delim(self.current.chars.next().unwrap()), }, @@ -121,13 +141,9 @@ impl<'a> Lexer<'a> { } } - fn consume_whitespace(&mut self) -> Token { - loop { - if is_whitespace(self.nth(0)) { - self.current.chars.next(); - } else { - return Token::Whitespace; - } + fn consume_whitespace(&mut self) { + while is_whitespace(self.nth(0)) { + self.current.chars.next(); } } @@ -353,14 +369,13 @@ impl<'a> Lexer<'a> { } } - fn consume_comment_token(&mut self) -> Token { + fn consume_comment(&mut self) { while let Some(c) = self.current.chars.next() { if c == '*' && self.nth(0) == '/' { self.current.chars.next(); - return Token::Comment; + return; } } - Token::Comment } fn is_number_start(&mut self) -> bool { diff --git a/crates/hdx_lexer/tests/tests.rs b/crates/hdx_lexer/tests/tests.rs index 1b3df580..c1520165 100644 --- a/crates/hdx_lexer/tests/tests.rs +++ b/crates/hdx_lexer/tests/tests.rs @@ -12,9 +12,9 @@ fn empty() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, ""); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Eof); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 0); } @@ -23,11 +23,11 @@ fn tokenizes_tilde_as_ddelim() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "~"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Delim('~')); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~')); assert_eq!(lex.pos(), 1); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 1); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 1); } @@ -36,11 +36,11 @@ fn tokenizes_newlines_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\r\n"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); } @@ -49,11 +49,11 @@ fn tokenizes_multiple_newlines_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\r\n"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); } @@ -62,39 +62,62 @@ fn tokenizes_multiple_whitespace_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\t \t \t"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 5); } #[test] fn tokenizes_trivial_css_file() { let allocator = Allocator::default(); - let mut lex = Lexer::new(&allocator, "body { color: black }"); + let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Ident(atom!("body"))); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("body"))); assert_eq!(lex.pos(), 4); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::LeftCurly); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::LeftCurly); assert_eq!(lex.pos(), 6); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 7); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("color"))); + assert_eq!(lex.pos(), 12); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Colon); + assert_eq!(lex.pos(), 13); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); + assert_eq!(lex.pos(), 14); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("black"))); + assert_eq!(lex.pos(), 19); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); + assert_eq!(lex.pos(), 20); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::RightCurly); + assert_eq!(lex.pos(), 21); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Comment); + assert_eq!(lex.pos(), 30); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); + assert_eq!(lex.pos(), 30); +} + +#[test] +fn skips_whitespace_and_comments_with_next() { + let allocator = Allocator::default(); + let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Ident(atom!("body"))); + assert_eq!(lex.pos(), 4); + assert_eq!(lex.next_token(), Token::LeftCurly); + assert_eq!(lex.pos(), 6); assert_eq!(lex.next_token(), Token::Ident(atom!("color"))); assert_eq!(lex.pos(), 12); assert_eq!(lex.next_token(), Token::Colon); assert_eq!(lex.pos(), 13); - assert_eq!(lex.next_token(), Token::Whitespace); - assert_eq!(lex.pos(), 14); assert_eq!(lex.next_token(), Token::Ident(atom!("black"))); assert_eq!(lex.pos(), 19); - assert_eq!(lex.next_token(), Token::Whitespace); - assert_eq!(lex.pos(), 20); assert_eq!(lex.next_token(), Token::RightCurly); assert_eq!(lex.pos(), 21); assert_eq!(lex.next_token(), Token::Eof); - assert_eq!(lex.pos(), 21); + assert_eq!(lex.pos(), 30); }