Skip to content

Commit

Permalink
embed trivia skipping in lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
keithamus committed Feb 4, 2024
1 parent e56767f commit 987a741
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 38 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ serde = { version = "1.0.171" }
serde_json = { version = "1.0.102" }
syn = { version = "2.0.26" }
quote = { version = "1.0.31" }
bitmask-enum = { version = "2.2.1" }

glob = { version = "0.3.1" }
pico-args = { version = "0.5.0" }
Expand Down
1 change: 1 addition & 0 deletions crates/hdx_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ hdx_atom.workspace = true
# Use OXC Allocator until https://github.com/fitzgen/bumpalo/pull/210 is resolved
oxc_allocator = { workspace = true }
bumpalo = { workspace = true, features = ["boxed", "collections"] }
bitmask-enum = { workspace = true }

miette = { workspace = true }
serde = { workspace = true, features = ["derive"], optional = true }
Expand Down
30 changes: 29 additions & 1 deletion crates/hdx_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod token;

use std::{collections::VecDeque, str::Chars};

use bitmask_enum::bitmask;
use oxc_allocator::Allocator;
pub use token::{NumType, PairWise, Token};

Expand All @@ -15,18 +16,31 @@ pub struct LexerCheckpoint<'a> {
prev_pos: u32,
}

// Bitmask selecting which trivia tokens `read_next_token` should surface
// to the caller instead of silently consuming. The `bitmask-enum` macro
// generates the bitwise operators plus the `none()`/`all()` constructors
// used elsewhere in this crate.
#[bitmask(u8)]
pub(crate) enum Include {
	// Surface `Token::Whitespace` runs.
	Whitespace = 0b0001,
	// Surface `Token::Comment` (`/* ... */`) tokens.
	Comments = 0b0010,
}

// Streaming CSS lexer. By default trivia (whitespace, comments) is
// skipped; the `include` mask temporarily widens what is surfaced.
pub struct Lexer<'a> {
	// Arena used for lexer allocations (OXC allocator; see Cargo.toml note).
	allocator: &'a Allocator,
	// The full source text being lexed.
	source: &'a str,
	// Cursor state: remaining chars, current token, previous position.
	current: LexerCheckpoint<'a>,
	// Buffered checkpoints for peeking ahead without losing position.
	lookahead: VecDeque<LexerCheckpoint<'a>>,
	// Which trivia kinds `read_next_token` should emit right now;
	// normally `Include::none()`, widened only inside `next_including_*`.
	include: Include,
}

impl<'a> Lexer<'a> {
/// Construct a lexer over `source`.
///
/// Trivia (whitespace and comments) is skipped by default; use the
/// `next_including_whitespace*` methods to surface trivia tokens.
///
/// NOTE(review): the rendered diff showed both the pre-change one-line
/// `Self { .. }` and the post-change struct literal; this is the
/// post-change version only.
pub fn new(allocator: &'a Allocator, source: &'a str) -> Self {
	let token = Token::default();
	let current = LexerCheckpoint { chars: source.chars(), token, prev_pos: 0 };
	Self {
		allocator,
		source,
		current,
		// Capacity 4 — presumably the parser's maximum peek depth; TODO confirm.
		lookahead: VecDeque::with_capacity(4),
		// Default: surface no trivia from read_next_token().
		include: Include::none(),
	}
}

/// Remaining string from `Chars`
Expand Down Expand Up @@ -108,4 +122,18 @@ impl<'a> Lexer<'a> {
}
self.read_next_token()
}

/// Lex the next token, surfacing `Token::Whitespace` runs (comments are
/// still skipped). The trivia filter is restored to `none()` before
/// returning, so subsequent plain `next` calls skip trivia again.
pub fn next_including_whitespace(&mut self) -> Token {
	// Widen the filter just for this read, then restore the default.
	self.include = Include::Whitespace;
	let tok = self.read_next_token();
	self.include = Include::none();
	tok
}

/// Lex the next token with no trivia filtering at all: both
/// `Token::Whitespace` and `Token::Comment` are surfaced. The filter is
/// reset to `none()` before the token is returned.
pub fn next_including_whitespace_and_comments(&mut self) -> Token {
	// Enable every Include bit for this single read.
	self.include = Include::all();
	let tok = self.read_next_token();
	self.include = Include::none();
	tok
}
}
41 changes: 28 additions & 13 deletions crates/hdx_lexer/src/private.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,20 @@ use crate::{
constants::{SINGLE_CHAR_TOKENS, SURROGATE_RANGE},
string_builder::AutoCow,
token::{NumType, Token},
Lexer,
Include, Lexer,
};

impl<'a> Lexer<'a> {
// True when the caller asked for whitespace tokens to be surfaced
// (the Whitespace bit is set in the `include` mask).
// NOTE(review): name has a typo ("whitspace"); renaming requires
// updating its call sites, so it is left as-is here.
#[inline]
fn include_whitspace(&self) -> bool {
	self.include & Include::Whitespace == Include::Whitespace
}

// True when the caller asked for comment tokens to be surfaced
// (the Comments bit is set in the `include` mask).
#[inline]
fn include_comments(&self) -> bool {
	self.include & Include::Comments == Include::Comments
}

#[inline]
fn nth(&self, n: usize) -> char {
self.current.chars.clone().nth(n).unwrap_or(EOF)
Expand Down Expand Up @@ -41,7 +51,13 @@ impl<'a> Lexer<'a> {
}
match c {
// Whitespace Range
c if is_whitespace(c) => self.consume_whitespace(),
c if is_whitespace(c) => {
self.consume_whitespace();
if self.include_whitspace() {
return Token::Whitespace;
}
self.read_next_token()
}
// Quote Range
c if is_quote(c) => self.consume_string_token(),
// Digit Range
Expand Down Expand Up @@ -110,7 +126,11 @@ impl<'a> Lexer<'a> {
'*' => {
self.current.chars.next();
self.current.chars.next();
self.consume_comment_token()
self.consume_comment();
if self.include_comments() {
return Token::Comment;
}
self.read_next_token()
}
_ => Token::Delim(self.current.chars.next().unwrap()),
},
Expand All @@ -121,13 +141,9 @@ impl<'a> Lexer<'a> {
}
}

fn consume_whitespace(&mut self) -> Token {
loop {
if is_whitespace(self.nth(0)) {
self.current.chars.next();
} else {
return Token::Whitespace;
}
fn consume_whitespace(&mut self) {
while is_whitespace(self.nth(0)) {
self.current.chars.next();
}
}

Expand Down Expand Up @@ -353,14 +369,13 @@ impl<'a> Lexer<'a> {
}
}

fn consume_comment_token(&mut self) -> Token {
fn consume_comment(&mut self) {
while let Some(c) = self.current.chars.next() {
if c == '*' && self.nth(0) == '/' {
self.current.chars.next();
return Token::Comment;
return;
}
}
Token::Comment
}

fn is_number_start(&mut self) -> bool {
Expand Down
71 changes: 47 additions & 24 deletions crates/hdx_lexer/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ fn empty() {
let allocator = Allocator::default();
let mut lex = Lexer::new(&allocator, "");
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 0);
}

Expand All @@ -23,11 +23,11 @@ fn tokenizes_tilde_as_ddelim() {
let allocator = Allocator::default();
let mut lex = Lexer::new(&allocator, "~");
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Delim('~'));
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Delim('~'));
assert_eq!(lex.pos(), 1);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 1);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 1);
}

Expand All @@ -36,11 +36,11 @@ fn tokenizes_newlines_as_whitespace() {
let allocator = Allocator::default();
let mut lex = Lexer::new(&allocator, "\r\n");
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Whitespace);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
assert_eq!(lex.pos(), 2);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 2);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 2);
}

Expand All @@ -49,11 +49,11 @@ fn tokenizes_multiple_newlines_as_whitespace() {
let allocator = Allocator::default();
let mut lex = Lexer::new(&allocator, "\r\n");
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Whitespace);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
assert_eq!(lex.pos(), 2);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 2);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 2);
}

Expand All @@ -62,39 +62,62 @@ fn tokenizes_multiple_whitespace_as_whitespace() {
let allocator = Allocator::default();
let mut lex = Lexer::new(&allocator, "\t \t \t");
assert_eq!(lex.pos(), 0);
assert_eq!(lex.next_token(), Token::Whitespace);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
assert_eq!(lex.pos(), 5);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 5);
assert_eq!(lex.next_token(), Token::Eof);
assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
assert_eq!(lex.pos(), 5);
}

// Every token — including whitespace and the trailing comment — is
// surfaced when lexing via next_including_whitespace_and_comments().
// NOTE(review): the rendered diff interleaved old next_token()-based
// assertions with the new ones; this is the post-change test only.
#[test]
fn tokenizes_trivial_css_file() {
	let allocator = Allocator::default();
	let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */");
	assert_eq!(lex.pos(), 0);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("body")));
	assert_eq!(lex.pos(), 4);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.pos(), 5);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::LeftCurly);
	assert_eq!(lex.pos(), 6);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.pos(), 7);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("color")));
	assert_eq!(lex.pos(), 12);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Colon);
	assert_eq!(lex.pos(), 13);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.pos(), 14);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Ident(atom!("black")));
	assert_eq!(lex.pos(), 19);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Whitespace);
	assert_eq!(lex.pos(), 20);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::RightCurly);
	assert_eq!(lex.pos(), 21);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Comment);
	assert_eq!(lex.pos(), 30);
	assert_eq!(lex.next_including_whitespace_and_comments(), Token::Eof);
	assert_eq!(lex.pos(), 30);
}

// Plain next_token() silently skips whitespace and comments: positions
// jump across the skipped trivia (e.g. 13 -> 19 across " " and
// 21 -> 30 across "/* fin */").
// NOTE(review): the visible lines mixed in leftover removed assertions
// from the old trivial-css test (Whitespace tokens, pos 21 at Eof);
// this is the intended trivia-skipping test only.
#[test]
fn skips_whitespace_and_comments_with_next() {
	let allocator = Allocator::default();
	let mut lex = Lexer::new(&allocator, "body { color: black }/* fin */");
	assert_eq!(lex.pos(), 0);
	assert_eq!(lex.next_token(), Token::Ident(atom!("body")));
	assert_eq!(lex.pos(), 4);
	assert_eq!(lex.next_token(), Token::LeftCurly);
	assert_eq!(lex.pos(), 6);
	assert_eq!(lex.next_token(), Token::Ident(atom!("color")));
	assert_eq!(lex.pos(), 12);
	assert_eq!(lex.next_token(), Token::Colon);
	assert_eq!(lex.pos(), 13);
	assert_eq!(lex.next_token(), Token::Ident(atom!("black")));
	assert_eq!(lex.pos(), 19);
	assert_eq!(lex.next_token(), Token::RightCurly);
	assert_eq!(lex.pos(), 21);
	assert_eq!(lex.next_token(), Token::Eof);
	assert_eq!(lex.pos(), 30);
}

0 comments on commit 987a741

Please sign in to comment.