From 987a74171e1759a96ae7e7ed2cf9df63c00d16b5 Mon Sep 17 00:00:00 2001 From: Keith Cirkel Date: Sun, 4 Feb 2024 13:19:55 +0000 Subject: [PATCH] embed trivia skipping in lexer --- Cargo.lock | 11 +++++ Cargo.toml | 1 + crates/hdx_lexer/Cargo.toml | 1 + crates/hdx_lexer/src/lib.rs | 30 +++++++++++++- crates/hdx_lexer/src/private.rs | 41 +++++++++++++------ crates/hdx_lexer/tests/tests.rs | 71 ++++++++++++++++++++++----------- 6 files changed, 117 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef5e1aa4..02e0b69c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,16 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +[[package]] +name = "bitmask-enum" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9990737a6d5740ff51cdbbc0f0503015cb30c390f6623968281eb214a520cfc0" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "bstr" version = "1.6.0" @@ -607,6 +617,7 @@ dependencies = [ name = "hdx_lexer" version = "0.0.1" dependencies = [ + "bitmask-enum", "bumpalo", "hdx_atom", "hdx_syntax", diff --git a/Cargo.toml b/Cargo.toml index 209ce215..02847a17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ serde = { version = "1.0.171" } serde_json = { version = "1.0.102" } syn = { version = "2.0.26" } quote = { version = "1.0.31" } +bitmask-enum = { version = "2.2.1" } glob = { version = "0.3.1" } pico-args = { version = "0.5.0" } diff --git a/crates/hdx_lexer/Cargo.toml b/crates/hdx_lexer/Cargo.toml index bbe23943..bc974819 100644 --- a/crates/hdx_lexer/Cargo.toml +++ b/crates/hdx_lexer/Cargo.toml @@ -16,6 +16,7 @@ hdx_atom.workspace = true # Use OXC Allocator until https://github.com/fitzgen/bumpalo/pull/210 is resolved oxc_allocator = { workspace = true } bumpalo = { workspace = true, features = ["boxed", "collections"] } +bitmask-enum = { workspace = true } miette = { 
workspace = true } serde = { workspace = true, features = ["derive"], optional = true } diff --git a/crates/hdx_lexer/src/lib.rs b/crates/hdx_lexer/src/lib.rs index 8d8e4ddc..dd7d8de1 100644 --- a/crates/hdx_lexer/src/lib.rs +++ b/crates/hdx_lexer/src/lib.rs @@ -5,6 +5,7 @@ mod token; use std::{collections::VecDeque, str::Chars}; +use bitmask_enum::bitmask; use oxc_allocator::Allocator; pub use token::{NumType, PairWise, Token}; @@ -15,18 +16,31 @@ pub struct LexerCheckpoint<'a> { prev_pos: u32, } +#[bitmask(u8)] +pub(crate) enum Include { + Whitespace = 0b0001, + Comments = 0b0010, +} + pub struct Lexer<'a> { allocator: &'a Allocator, source: &'a str, current: LexerCheckpoint<'a>, lookahead: VecDeque<LexerCheckpoint<'a>>, + include: Include, } impl<'a> Lexer<'a> { pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { let token = Token::default(); let current = LexerCheckpoint { chars: source.chars(), token, prev_pos: 0 }; - Self { allocator, source, current, lookahead: VecDeque::with_capacity(4) } + Self { + allocator, + source, + current, + lookahead: VecDeque::with_capacity(4), + include: Include::none(), + } } /// Remaining string from `Chars` @@ -108,4 +122,18 @@ impl<'a> Lexer<'a> { } self.read_next_token() } + + pub fn next_including_whitespace(&mut self) -> Token { + self.include = Include::Whitespace; + let token = self.read_next_token(); + self.include = Include::none(); + token + } + + pub fn next_including_whitespace_and_comments(&mut self) -> Token { + self.include = Include::all(); + let token = self.read_next_token(); + self.include = Include::none(); + token + } } diff --git a/crates/hdx_lexer/src/private.rs b/crates/hdx_lexer/src/private.rs index cd068a84..17fd318a 100644 --- a/crates/hdx_lexer/src/private.rs +++ b/crates/hdx_lexer/src/private.rs @@ -10,10 +10,20 @@ use crate::{ constants::{SINGLE_CHAR_TOKENS, SURROGATE_RANGE}, string_builder::AutoCow, token::{NumType, Token}, - Lexer, + Include, Lexer, }; impl<'a> Lexer<'a> { + #[inline] + fn 
include_whitespace(&self) -> bool { + self.include & Include::Whitespace == Include::Whitespace + } + + #[inline] + fn include_comments(&self) -> bool { + self.include & Include::Comments == Include::Comments + } + #[inline] fn nth(&self, n: usize) -> char { self.current.chars.clone().nth(n).unwrap_or(EOF) @@ -41,7 +51,13 @@ impl<'a> Lexer<'a> { } match c { // Whitespace Range - c if is_whitespace(c) => self.consume_whitespace(), + c if is_whitespace(c) => { + self.consume_whitespace(); + if self.include_whitespace() { + return Token::Whitespace; + } + self.read_next_token() + } // Quote Range c if is_quote(c) => self.consume_string_token(), // Digit Range @@ -110,7 +126,11 @@ impl<'a> Lexer<'a> { '*' => { self.current.chars.next(); self.current.chars.next(); - self.consume_comment_token() + self.consume_comment(); + if self.include_comments() { + return Token::Comment; + } + self.read_next_token() } _ => Token::Delim(self.current.chars.next().unwrap()), }, @@ -121,13 +141,9 @@ impl<'a> Lexer<'a> { } } - fn consume_whitespace(&mut self) -> Token { - loop { - if is_whitespace(self.nth(0)) { - self.current.chars.next(); - } else { - return Token::Whitespace; - } + fn consume_whitespace(&mut self) { + while is_whitespace(self.nth(0)) { + self.current.chars.next(); } } @@ -353,14 +369,13 @@ impl<'a> Lexer<'a> { } } - fn consume_comment_token(&mut self) -> Token { + fn consume_comment(&mut self) { while let Some(c) = self.current.chars.next() { if c == '*' && self.nth(0) == '/' { self.current.chars.next(); - return Token::Comment; + return; } } - Token::Comment } fn is_number_start(&mut self) -> bool { diff --git a/crates/hdx_lexer/tests/tests.rs b/crates/hdx_lexer/tests/tests.rs index 1b3df580..c1520165 100644 --- a/crates/hdx_lexer/tests/tests.rs +++ b/crates/hdx_lexer/tests/tests.rs @@ -12,9 +12,9 @@ fn empty() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, ""); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Eof); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 0); } @@ -23,11 +23,11 @@ fn tokenizes_tilde_as_ddelim() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "~"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Delim('~')); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~')); assert_eq!(lex.pos(), 1); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 1); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 1); } @@ -36,11 +36,11 @@ fn tokenizes_newlines_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\r\n"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); } @@ -49,11 +49,11 @@ fn tokenizes_multiple_newlines_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\r\n"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); - assert_eq!(lex.next_token(), Token::Eof); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 2); } @@ -62,39 +62,62 @@ fn tokenizes_multiple_whitespace_as_whitespace() { let allocator = Allocator::default(); let mut lex = Lexer::new(&allocator, "\t \t \t"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); assert_eq!(lex.pos(), 5); } #[test] fn tokenizes_trivial_css_file() { let allocator = Allocator::default(); - let mut lex = Lexer::new(&allocator, "body { color: black }"); + let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */"); assert_eq!(lex.pos(), 0); - assert_eq!(lex.next_token(), Token::Ident(atom!("body"))); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("body"))); assert_eq!(lex.pos(), 4); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 5); - assert_eq!(lex.next_token(), Token::LeftCurly); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::LeftCurly); assert_eq!(lex.pos(), 6); - assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); assert_eq!(lex.pos(), 7); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("color"))); + assert_eq!(lex.pos(), 12); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Colon); + assert_eq!(lex.pos(), 13); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); + assert_eq!(lex.pos(), 14); + 
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("black"))); + assert_eq!(lex.pos(), 19); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace); + assert_eq!(lex.pos(), 20); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::RightCurly); + assert_eq!(lex.pos(), 21); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Comment); + assert_eq!(lex.pos(), 30); + assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof); + assert_eq!(lex.pos(), 30); +} + +#[test] +fn skips_whitespace_and_comments_with_next() { + let allocator = Allocator::default(); + let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Ident(atom!("body"))); + assert_eq!(lex.pos(), 4); + assert_eq!(lex.next_token(), Token::LeftCurly); + assert_eq!(lex.pos(), 6); assert_eq!(lex.next_token(), Token::Ident(atom!("color"))); assert_eq!(lex.pos(), 12); assert_eq!(lex.next_token(), Token::Colon); assert_eq!(lex.pos(), 13); - assert_eq!(lex.next_token(), Token::Whitespace); - assert_eq!(lex.pos(), 14); assert_eq!(lex.next_token(), Token::Ident(atom!("black"))); assert_eq!(lex.pos(), 19); - assert_eq!(lex.next_token(), Token::Whitespace); - assert_eq!(lex.pos(), 20); assert_eq!(lex.next_token(), Token::RightCurly); assert_eq!(lex.pos(), 21); assert_eq!(lex.next_token(), Token::Eof); - assert_eq!(lex.pos(), 21); + assert_eq!(lex.pos(), 30); }