Commit 987a741

embed trivia skipping in lexer

1 parent e56767f

6 files changed: 117 additions & 38 deletions
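This commit moves trivia skipping into the lexer itself: a new `Include` bitmask (from the `bitmask-enum` crate) tracks whether whitespace and comment tokens should be surfaced, `next_token()` now silently consumes them, and two new methods, `next_including_whitespace()` and `next_including_whitespace_and_comments()`, opt back in per call. A minimal usage sketch of the post-commit API, assuming the crate-level exports used by the tests (`hdx_lexer::{Lexer, Token}` and `oxc_allocator::Allocator`); it is an illustration, not code from the commit:

```rust
use hdx_lexer::{Lexer, Token};
use oxc_allocator::Allocator;

fn main() {
	let allocator = Allocator::default();

	// Default path: whitespace and comments are consumed inside the lexer.
	let mut lex = Lexer::new(&allocator, "~ /* note */ ~");
	assert_eq!(lex.next_token(), Token::Delim('~'));
	assert_eq!(lex.next_token(), Token::Delim('~'));
	assert_eq!(lex.next_token(), Token::Eof);

	// Opt-in path: the same input surfaces the trivia tokens.
	let mut lex = Lexer::new(&allocator, "~ /* note */ ~");
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~'));
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Comment);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~'));
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
}
```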

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -35,6 +35,7 @@ serde = { version = "1.0.171" }
 serde_json = { version = "1.0.102" }
 syn = { version = "2.0.26" }
 quote = { version = "1.0.31" }
+bitmask-enum = { version = "2.2.1" }
 
 glob = { version = "0.3.1" }
 pico-args = { version = "0.5.0" }
```

crates/hdx_lexer/Cargo.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -16,6 +16,7 @@ hdx_atom.workspace = true
 # Use OXC Allocator until https://github.com/fitzgen/bumpalo/pull/210 is resolved
 oxc_allocator = { workspace = true }
 bumpalo = { workspace = true, features = ["boxed", "collections"] }
+bitmask-enum = { workspace = true }
 
 miette = { workspace = true }
 serde = { workspace = true, features = ["derive"], optional = true }
```

crates/hdx_lexer/src/lib.rs

Lines changed: 29 additions & 1 deletion
```diff
@@ -5,6 +5,7 @@ mod token;
 
 use std::{collections::VecDeque, str::Chars};
 
+use bitmask_enum::bitmask;
 use oxc_allocator::Allocator;
 pub use token::{NumType, PairWise, Token};
 
@@ -15,18 +16,31 @@ pub struct LexerCheckpoint<'a> {
 	prev_pos: u32,
 }
 
+#[bitmask(u8)]
+pub(crate) enum Include {
+	Whitespace = 0b0001,
+	Comments = 0b0010,
+}
+
 pub struct Lexer<'a> {
 	allocator: &'a Allocator,
 	source: &'a str,
 	current: LexerCheckpoint<'a>,
 	lookahead: VecDeque<LexerCheckpoint<'a>>,
+	include: Include,
 }
 
 impl<'a> Lexer<'a> {
 	pub fn new(allocator: &'a Allocator, source: &'a str) -> Self {
 		let token = Token::default();
 		let current = LexerCheckpoint { chars: source.chars(), token, prev_pos: 0 };
-		Self { allocator, source, current, lookahead: VecDeque::with_capacity(4) }
+		Self {
+			allocator,
+			source,
+			current,
+			lookahead: VecDeque::with_capacity(4),
+			include: Include::none(),
+		}
 	}
 
 	/// Remaining string from `Chars`
@@ -108,4 +122,18 @@ impl<'a> Lexer<'a> {
 		}
 		self.read_next_token()
 	}
+
+	pub fn next_including_whitespace(&mut self) -> Token {
+		self.include = Include::Whitespace;
+		let token = self.read_next_token();
+		self.include = Include::none();
+		token
+	}
+
+	pub fn next_including_whitespace_and_comments(&mut self) -> Token {
+		self.include = Include::all();
+		let token = self.read_next_token();
+		self.include = Include::none();
+		token
+	}
 }
```
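The `Include::none()` / `Include::all()` constructors and the `&`-based membership checks used by the lexer come from the `bitmask-enum` derive rather than hand-written impls. A small standalone sketch of that pattern, mirroring the commit's `Include` type (illustration only, not part of the diff):

```rust
use bitmask_enum::bitmask;

// Mirrors the commit's `Include` bitmask: one bit per kind of trivia.
#[bitmask(u8)]
enum Include {
	Whitespace = 0b0001,
	Comments = 0b0010,
}

fn main() {
	let include = Include::Whitespace;
	// The same containment test the lexer performs internally.
	assert!(include & Include::Whitespace == Include::Whitespace);
	assert!(include & Include::Comments != Include::Comments);
	// `none()` contains no flags, `all()` contains every flag.
	assert!(Include::none() & Include::Whitespace != Include::Whitespace);
	assert!(Include::all() & Include::Comments == Include::Comments);
}
```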

crates/hdx_lexer/src/private.rs

Lines changed: 28 additions & 13 deletions
```diff
@@ -10,10 +10,20 @@ use crate::{
 	constants::{SINGLE_CHAR_TOKENS, SURROGATE_RANGE},
 	string_builder::AutoCow,
 	token::{NumType, Token},
-	Lexer,
+	Include, Lexer,
 };
 
 impl<'a> Lexer<'a> {
+	#[inline]
+	fn include_whitspace(&self) -> bool {
+		self.include & Include::Whitespace == Include::Whitespace
+	}
+
+	#[inline]
+	fn include_comments(&self) -> bool {
+		self.include & Include::Comments == Include::Comments
+	}
+
 	#[inline]
 	fn nth(&self, n: usize) -> char {
 		self.current.chars.clone().nth(n).unwrap_or(EOF)
@@ -41,7 +51,13 @@ impl<'a> Lexer<'a> {
 		}
 		match c {
 			// Whitespace Range
-			c if is_whitespace(c) => self.consume_whitespace(),
+			c if is_whitespace(c) => {
+				self.consume_whitespace();
+				if self.include_whitspace() {
+					return Token::Whitespace;
+				}
+				self.read_next_token()
+			}
 			// Quote Range
 			c if is_quote(c) => self.consume_string_token(),
 			// Digit Range
@@ -110,7 +126,11 @@ impl<'a> Lexer<'a> {
 			'*' => {
 				self.current.chars.next();
 				self.current.chars.next();
-				self.consume_comment_token()
+				self.consume_comment();
+				if self.include_comments() {
+					return Token::Comment;
+				}
+				self.read_next_token()
 			}
 			_ => Token::Delim(self.current.chars.next().unwrap()),
 		},
@@ -121,13 +141,9 @@ impl<'a> Lexer<'a> {
 		}
 	}
 
-	fn consume_whitespace(&mut self) -> Token {
-		loop {
-			if is_whitespace(self.nth(0)) {
-				self.current.chars.next();
-			} else {
-				return Token::Whitespace;
-			}
+	fn consume_whitespace(&mut self) {
+		while is_whitespace(self.nth(0)) {
+			self.current.chars.next();
 		}
 	}
 
@@ -353,14 +369,13 @@ impl<'a> Lexer<'a> {
 		}
 	}
 
-	fn consume_comment_token(&mut self) -> Token {
+	fn consume_comment(&mut self) {
 		while let Some(c) = self.current.chars.next() {
 			if c == '*' && self.nth(0) == '/' {
 				self.current.chars.next();
-				return Token::Comment;
+				return;
 			}
 		}
-		Token::Comment
 	}
 
 	fn is_number_start(&mut self) -> bool {
```
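In `read_next_token()` the flow is now: consume the trivia unconditionally, then either surface it as `Token::Whitespace` / `Token::Comment` (when the matching `Include` bit is set) or recurse to fetch the next non-trivia token; the recursion terminates because each pass consumes at least one character. A stripped-down toy model of that control flow, using hypothetical names that are not part of the commit:

```rust
// Toy model of embedded trivia skipping: whitespace is always consumed,
// but only surfaced as a token when the caller opted in.
#[derive(Debug, PartialEq)]
enum Tok {
	Word(String),
	Whitespace,
	Eof,
}

struct Toy<'a> {
	rest: &'a str,
	include_whitespace: bool,
}

impl<'a> Toy<'a> {
	fn next_token(&mut self) -> Tok {
		let ws_len = self.rest.len() - self.rest.trim_start().len();
		if ws_len > 0 {
			self.rest = &self.rest[ws_len..];
			if self.include_whitespace {
				return Tok::Whitespace;
			}
			return self.next_token(); // recurse for the next non-trivia token
		}
		if self.rest.is_empty() {
			return Tok::Eof;
		}
		let end = self.rest.find(char::is_whitespace).unwrap_or(self.rest.len());
		let word = self.rest[..end].to_string();
		self.rest = &self.rest[end..];
		Tok::Word(word)
	}
}

fn main() {
	let mut toy = Toy { rest: "  a b ", include_whitespace: false };
	assert_eq!(toy.next_token(), Tok::Word("a".into()));
	assert_eq!(toy.next_token(), Tok::Word("b".into()));
	assert_eq!(toy.next_token(), Tok::Eof);
}
```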

crates/hdx_lexer/tests/tests.rs

Lines changed: 47 additions & 24 deletions
```diff
@@ -12,9 +12,9 @@ fn empty() {
 	let allocator = Allocator::default();
 	let mut lex = Lexer::new(&allocator, "");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 0);
 }
 
@@ -23,11 +23,11 @@ fn tokenizes_tilde_as_ddelim() {
 	let allocator = Allocator::default();
 	let mut lex = Lexer::new(&allocator, "~");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Delim('~'));
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~'));
 	assert_eq!(lex.pos(), 1);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 1);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 1);
 }
 
@@ -36,11 +36,11 @@ fn tokenizes_newlines_as_whitespace() {
 	let allocator = Allocator::default();
 	let mut lex = Lexer::new(&allocator, "\r\n");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Whitespace);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
 	assert_eq!(lex.pos(), 2);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 2);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 2);
 }
 
@@ -49,11 +49,11 @@ fn tokenizes_multiple_newlines_as_whitespace() {
 	let allocator = Allocator::default();
 	let mut lex = Lexer::new(&allocator, "\r\n");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Whitespace);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
 	assert_eq!(lex.pos(), 2);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 2);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 2);
 }
 
@@ -62,39 +62,62 @@ fn tokenizes_multiple_whitespace_as_whitespace() {
 	let allocator = Allocator::default();
 	let mut lex = Lexer::new(&allocator, "\t \t \t");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Whitespace);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
 	assert_eq!(lex.pos(), 5);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 5);
-	assert_eq!(lex.next_token(), Token::Eof);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
 	assert_eq!(lex.pos(), 5);
 }
 
 #[test]
 fn tokenizes_trivial_css_file() {
 	let allocator = Allocator::default();
-	let mut lex = Lexer::new(&allocator, "body { color: black }");
+	let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */");
 	assert_eq!(lex.pos(), 0);
-	assert_eq!(lex.next_token(), Token::Ident(atom!("body")));
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("body")));
 	assert_eq!(lex.pos(), 4);
-	assert_eq!(lex.next_token(), Token::Whitespace);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
 	assert_eq!(lex.pos(), 5);
-	assert_eq!(lex.next_token(), Token::LeftCurly);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::LeftCurly);
 	assert_eq!(lex.pos(), 6);
-	assert_eq!(lex.next_token(), Token::Whitespace);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
 	assert_eq!(lex.pos(), 7);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("color")));
+	assert_eq!(lex.pos(), 12);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Colon);
+	assert_eq!(lex.pos(), 13);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
+	assert_eq!(lex.pos(), 14);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("black")));
+	assert_eq!(lex.pos(), 19);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
+	assert_eq!(lex.pos(), 20);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::RightCurly);
+	assert_eq!(lex.pos(), 21);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Comment);
+	assert_eq!(lex.pos(), 30);
+	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
+	assert_eq!(lex.pos(), 30);
+}
+
+#[test]
+fn skips_whitespace_and_comments_with_next() {
+	let allocator = Allocator::default();
+	let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */");
+	assert_eq!(lex.pos(), 0);
+	assert_eq!(lex.next_token(), Token::Ident(atom!("body")));
+	assert_eq!(lex.pos(), 4);
+	assert_eq!(lex.next_token(), Token::LeftCurly);
+	assert_eq!(lex.pos(), 6);
 	assert_eq!(lex.next_token(), Token::Ident(atom!("color")));
 	assert_eq!(lex.pos(), 12);
 	assert_eq!(lex.next_token(), Token::Colon);
 	assert_eq!(lex.pos(), 13);
-	assert_eq!(lex.next_token(), Token::Whitespace);
-	assert_eq!(lex.pos(), 14);
 	assert_eq!(lex.next_token(), Token::Ident(atom!("black")));
 	assert_eq!(lex.pos(), 19);
-	assert_eq!(lex.next_token(), Token::Whitespace);
-	assert_eq!(lex.pos(), 20);
 	assert_eq!(lex.next_token(), Token::RightCurly);
 	assert_eq!(lex.pos(), 21);
 	assert_eq!(lex.next_token(), Token::Eof);
-	assert_eq!(lex.pos(), 21);
+	assert_eq!(lex.pos(), 30);
 }
```
