diff --git a/Cargo.lock b/Cargo.lock index b9af347a..ef5e1aa4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -530,6 +530,7 @@ dependencies = [ "hdx_atomizable_derive", "hdx_lexer", "hdx_syntax", + "miette", "oxc_allocator", "serde", "serde_json", diff --git a/crates/hdx_ast/Cargo.toml b/crates/hdx_ast/Cargo.toml index 7734a5bb..f710d806 100644 --- a/crates/hdx_ast/Cargo.toml +++ b/crates/hdx_ast/Cargo.toml @@ -18,6 +18,7 @@ hdx_atomizable_derive = { workspace = true } # Use OXC Allocator until https://github.com/fitzgen/bumpalo/pull/210 is resolved oxc_allocator = { workspace = true } bumpalo = { workspace = true, features = ["collections", "boxed"] } +miette = { workspace = true } serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } diff --git a/crates/hdx_ast/src/css/qualified_rule.rs b/crates/hdx_ast/src/css/qualified_rule.rs index 423ba236..56356a8e 100644 --- a/crates/hdx_ast/src/css/qualified_rule.rs +++ b/crates/hdx_ast/src/css/qualified_rule.rs @@ -1,7 +1,8 @@ -use hdx_lexer::Span; #[cfg(feature = "serde")] use serde::Serialize; +use crate::Span; + #[derive(Debug, Hash)] #[cfg_attr(feature = "serde", derive(Serialize), serde(tag = "type", rename_all = "camelCase"))] pub struct QualifiedRule<'a> { diff --git a/crates/hdx_ast/src/css/rules/page.rs b/crates/hdx_ast/src/css/rules/page.rs index 6f2e3d32..1f606ac6 100644 --- a/crates/hdx_ast/src/css/rules/page.rs +++ b/crates/hdx_ast/src/css/rules/page.rs @@ -1,10 +1,9 @@ -use hdx_lexer::Span; use oxc_allocator::{Box, Vec}; #[cfg(feature = "serde")] use serde::Serialize; use crate::{ - atom, css::properties::Property, Atom, Atomizable, Spanned, Specificity, ToSpecificity, + atom, css::properties::Property, Atom, Atomizable, Span, Spanned, Specificity, ToSpecificity, }; // https://drafts.csswg.org/cssom-1/#csspagerule diff --git a/crates/hdx_ast/src/lib.rs b/crates/hdx_ast/src/lib.rs index 09a4c9f0..b7f722ac 100644 --- a/crates/hdx_ast/src/lib.rs +++ 
b/crates/hdx_ast/src/lib.rs @@ -1,5 +1,3 @@ -#![feature(slice_concat_trait)] - extern crate hdx_atomizable_derive; pub use hdx_atomizable_derive::Atomizable; @@ -7,11 +5,13 @@ pub use hdx_atomizable_derive::Atomizable; use serde::Serialize; pub mod css; +pub mod span; pub mod traits; pub(crate) use hdx_atom::{atom, Atom, Atomizable}; -pub(crate) use hdx_lexer::{PairWise, Span, Token}; +pub(crate) use hdx_lexer::{PairWise, Token}; pub(crate) use oxc_allocator::{Allocator, Box, Vec}; +pub(crate) use span::Span; pub use traits::Unit; #[derive(Debug, PartialEq, Hash)] diff --git a/crates/hdx_lexer/src/span.rs b/crates/hdx_ast/src/span.rs similarity index 100% rename from crates/hdx_lexer/src/span.rs rename to crates/hdx_ast/src/span.rs diff --git a/crates/hdx_lexer/src/constants.rs b/crates/hdx_lexer/src/constants.rs index 84e3863a..33285ed5 100644 --- a/crates/hdx_lexer/src/constants.rs +++ b/crates/hdx_lexer/src/constants.rs @@ -1,134 +1,134 @@ -use super::Kind; +use super::Token; pub const SURROGATE_RANGE: std::ops::RangeInclusive = 0xd800..=0xdfff; -pub const SINGLE_CHAR_TOKENS: &[Kind; 128] = &[ - /* 0 */ Kind::Undetermined, - /* 1 */ Kind::Undetermined, - /* 2 */ Kind::Undetermined, - /* 3 */ Kind::Undetermined, - /* 4 */ Kind::Undetermined, - /* 5 */ Kind::Undetermined, - /* 6 */ Kind::Undetermined, - /* 7 */ Kind::Undetermined, - /* 8 */ Kind::Undetermined, - /* 9 */ Kind::Undetermined, - /* 10 */ Kind::Undetermined, - /* 11 */ Kind::Undetermined, - /* 12 */ Kind::Undetermined, - /* 13 */ Kind::Undetermined, - /* 14 */ Kind::Undetermined, - /* 15 */ Kind::Undetermined, - /* 16 */ Kind::Undetermined, - /* 17 */ Kind::Undetermined, - /* 18 */ Kind::Undetermined, - /* 19 */ Kind::Undetermined, - /* 20 */ Kind::Undetermined, - /* 21 */ Kind::Undetermined, - /* 22 */ Kind::Undetermined, - /* 23 */ Kind::Undetermined, - /* 24 */ Kind::Undetermined, - /* 25 */ Kind::Undetermined, - /* 26 */ Kind::Undetermined, - /* 27 */ Kind::Undetermined, - /* 28 */ 
Kind::Undetermined, - /* 29 */ Kind::Undetermined, - /* 30 */ Kind::Undetermined, - /* 31 */ Kind::Undetermined, - /* 32 */ Kind::Undetermined, - /* 33 */ Kind::Undetermined, - /* 34 */ Kind::Undetermined, - /* 35 */ Kind::Undetermined, - /* 36 */ Kind::Undetermined, - /* 37 */ Kind::Undetermined, - /* 38 */ Kind::Undetermined, - /* 39 */ Kind::Undetermined, - /* 40 */ Kind::LeftParen, // 0x28 - /* 41 */ Kind::RightParen, // 0x29 - /* 42 */ Kind::Undetermined, - /* 43 */ Kind::Undetermined, - /* 44 */ Kind::Comma, // 0x2C - /* 45 */ Kind::Undetermined, - /* 46 */ Kind::Undetermined, - /* 47 */ Kind::Undetermined, - /* 48 */ Kind::Undetermined, - /* 49 */ Kind::Undetermined, - /* 50 */ Kind::Undetermined, - /* 51 */ Kind::Undetermined, - /* 52 */ Kind::Undetermined, - /* 53 */ Kind::Undetermined, - /* 54 */ Kind::Undetermined, - /* 55 */ Kind::Undetermined, - /* 56 */ Kind::Undetermined, - /* 57 */ Kind::Undetermined, - /* 58 */ Kind::Colon, // 0x3A - /* 59 */ Kind::Semicolon, // 0x3B - /* 60 */ Kind::Undetermined, - /* 61 */ Kind::Undetermined, - /* 62 */ Kind::Undetermined, // 0x3E - /* 63 */ Kind::Undetermined, - /* 64 */ Kind::Undetermined, - /* 65 */ Kind::Undetermined, - /* 66 */ Kind::Undetermined, - /* 67 */ Kind::Undetermined, - /* 68 */ Kind::Undetermined, - /* 69 */ Kind::Undetermined, - /* 70 */ Kind::Undetermined, - /* 71 */ Kind::Undetermined, - /* 72 */ Kind::Undetermined, - /* 73 */ Kind::Undetermined, - /* 74 */ Kind::Undetermined, - /* 75 */ Kind::Undetermined, - /* 76 */ Kind::Undetermined, - /* 77 */ Kind::Undetermined, - /* 78 */ Kind::Undetermined, - /* 79 */ Kind::Undetermined, - /* 80 */ Kind::Undetermined, - /* 81 */ Kind::Undetermined, - /* 82 */ Kind::Undetermined, - /* 83 */ Kind::Undetermined, - /* 84 */ Kind::Undetermined, - /* 85 */ Kind::Undetermined, - /* 86 */ Kind::Undetermined, - /* 87 */ Kind::Undetermined, - /* 88 */ Kind::Undetermined, - /* 89 */ Kind::Undetermined, - /* 90 */ Kind::Undetermined, - /* 91 */ Kind::LeftSquare, // 
0x5B - /* 92 */ Kind::Undetermined, - /* 93 */ Kind::RightSquare, // 0x5D - /* 94 */ Kind::Undetermined, - /* 95 */ Kind::Undetermined, - /* 96 */ Kind::Undetermined, - /* 97 */ Kind::Undetermined, - /* 98 */ Kind::Undetermined, - /* 99 */ Kind::Undetermined, - /* 100 */ Kind::Undetermined, - /* 101 */ Kind::Undetermined, - /* 102 */ Kind::Undetermined, - /* 103 */ Kind::Undetermined, - /* 104 */ Kind::Undetermined, - /* 105 */ Kind::Undetermined, - /* 106 */ Kind::Undetermined, - /* 107 */ Kind::Undetermined, - /* 108 */ Kind::Undetermined, - /* 109 */ Kind::Undetermined, - /* 110 */ Kind::Undetermined, - /* 111 */ Kind::Undetermined, - /* 112 */ Kind::Undetermined, - /* 113 */ Kind::Undetermined, - /* 114 */ Kind::Undetermined, - /* 115 */ Kind::Undetermined, - /* 116 */ Kind::Undetermined, - /* 117 */ Kind::Undetermined, - /* 118 */ Kind::Undetermined, - /* 119 */ Kind::Undetermined, - /* 120 */ Kind::Undetermined, - /* 121 */ Kind::Undetermined, - /* 122 */ Kind::Undetermined, - /* 123 */ Kind::LeftCurly, // 0x7B - /* 124 */ Kind::Undetermined, - /* 125 */ Kind::RightCurly, // 0x7D - /* 126 */ Kind::Delim, // 0x7E - /* 127 */ Kind::Undetermined, +pub const SINGLE_CHAR_TOKENS: &[Token; 128] = &[ + /* 0 */ Token::Undetermined, + /* 1 */ Token::Undetermined, + /* 2 */ Token::Undetermined, + /* 3 */ Token::Undetermined, + /* 4 */ Token::Undetermined, + /* 5 */ Token::Undetermined, + /* 6 */ Token::Undetermined, + /* 7 */ Token::Undetermined, + /* 8 */ Token::Undetermined, + /* 9 */ Token::Undetermined, + /* 10 */ Token::Undetermined, + /* 11 */ Token::Undetermined, + /* 12 */ Token::Undetermined, + /* 13 */ Token::Undetermined, + /* 14 */ Token::Undetermined, + /* 15 */ Token::Undetermined, + /* 16 */ Token::Undetermined, + /* 17 */ Token::Undetermined, + /* 18 */ Token::Undetermined, + /* 19 */ Token::Undetermined, + /* 20 */ Token::Undetermined, + /* 21 */ Token::Undetermined, + /* 22 */ Token::Undetermined, + /* 23 */ Token::Undetermined, + /* 24 */ 
Token::Undetermined, + /* 25 */ Token::Undetermined, + /* 26 */ Token::Undetermined, + /* 27 */ Token::Undetermined, + /* 28 */ Token::Undetermined, + /* 29 */ Token::Undetermined, + /* 30 */ Token::Undetermined, + /* 31 */ Token::Undetermined, + /* 32 */ Token::Undetermined, + /* 33 */ Token::Undetermined, + /* 34 */ Token::Undetermined, + /* 35 */ Token::Undetermined, + /* 36 */ Token::Undetermined, + /* 37 */ Token::Undetermined, + /* 38 */ Token::Undetermined, + /* 39 */ Token::Undetermined, + /* 40 */ Token::LeftParen, // 0x28 + /* 41 */ Token::RightParen, // 0x29 + /* 42 */ Token::Undetermined, + /* 43 */ Token::Undetermined, + /* 44 */ Token::Comma, // 0x2C + /* 45 */ Token::Undetermined, + /* 46 */ Token::Undetermined, + /* 47 */ Token::Undetermined, + /* 48 */ Token::Undetermined, + /* 49 */ Token::Undetermined, + /* 50 */ Token::Undetermined, + /* 51 */ Token::Undetermined, + /* 52 */ Token::Undetermined, + /* 53 */ Token::Undetermined, + /* 54 */ Token::Undetermined, + /* 55 */ Token::Undetermined, + /* 56 */ Token::Undetermined, + /* 57 */ Token::Undetermined, + /* 58 */ Token::Colon, // 0x3A + /* 59 */ Token::Semicolon, // 0x3B + /* 60 */ Token::Undetermined, + /* 61 */ Token::Undetermined, + /* 62 */ Token::Undetermined, // 0x3E + /* 63 */ Token::Undetermined, + /* 64 */ Token::Undetermined, + /* 65 */ Token::Undetermined, + /* 66 */ Token::Undetermined, + /* 67 */ Token::Undetermined, + /* 68 */ Token::Undetermined, + /* 69 */ Token::Undetermined, + /* 70 */ Token::Undetermined, + /* 71 */ Token::Undetermined, + /* 72 */ Token::Undetermined, + /* 73 */ Token::Undetermined, + /* 74 */ Token::Undetermined, + /* 75 */ Token::Undetermined, + /* 76 */ Token::Undetermined, + /* 77 */ Token::Undetermined, + /* 78 */ Token::Undetermined, + /* 79 */ Token::Undetermined, + /* 80 */ Token::Undetermined, + /* 81 */ Token::Undetermined, + /* 82 */ Token::Undetermined, + /* 83 */ Token::Undetermined, + /* 84 */ Token::Undetermined, + /* 85 */ Token::Undetermined, 
+ /* 86 */ Token::Undetermined, + /* 87 */ Token::Undetermined, + /* 88 */ Token::Undetermined, + /* 89 */ Token::Undetermined, + /* 90 */ Token::Undetermined, + /* 91 */ Token::LeftSquare, // 0x5B + /* 92 */ Token::Undetermined, + /* 93 */ Token::RightSquare, // 0x5D + /* 94 */ Token::Undetermined, + /* 95 */ Token::Undetermined, + /* 96 */ Token::Undetermined, + /* 97 */ Token::Undetermined, + /* 98 */ Token::Undetermined, + /* 99 */ Token::Undetermined, + /* 100 */ Token::Undetermined, + /* 101 */ Token::Undetermined, + /* 102 */ Token::Undetermined, + /* 103 */ Token::Undetermined, + /* 104 */ Token::Undetermined, + /* 105 */ Token::Undetermined, + /* 106 */ Token::Undetermined, + /* 107 */ Token::Undetermined, + /* 108 */ Token::Undetermined, + /* 109 */ Token::Undetermined, + /* 110 */ Token::Undetermined, + /* 111 */ Token::Undetermined, + /* 112 */ Token::Undetermined, + /* 113 */ Token::Undetermined, + /* 114 */ Token::Undetermined, + /* 115 */ Token::Undetermined, + /* 116 */ Token::Undetermined, + /* 117 */ Token::Undetermined, + /* 118 */ Token::Undetermined, + /* 119 */ Token::Undetermined, + /* 120 */ Token::Undetermined, + /* 121 */ Token::Undetermined, + /* 122 */ Token::Undetermined, + /* 123 */ Token::LeftCurly, // 0x7B + /* 124 */ Token::Undetermined, + /* 125 */ Token::RightCurly, // 0x7D + /* 126 */ Token::Undetermined, // 0x7E + /* 127 */ Token::Undetermined, ]; diff --git a/crates/hdx_lexer/src/kind.rs b/crates/hdx_lexer/src/kind.rs deleted file mode 100644 index 5a0496fd..00000000 --- a/crates/hdx_lexer/src/kind.rs +++ /dev/null @@ -1,80 +0,0 @@ -//! 
CSS Token Kinds - -use std::fmt; - -#[cfg(feature = "serde")] -use serde::Serialize; - -#[derive(Debug, Default, Clone, Copy, Eq, PartialEq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize))] -pub enum Kind { - Undetermined, - #[default] - Eof, - Comment, - Ident, - Function, - AtKeyword, - Hash, - String, - BadString, - Url, - BadUrl, - Delim, - Number, - Percentage, - Dimension, - Whitespace, - Cdo, - Cdc, - Colon, - Semicolon, - Comma, - LeftSquare, - RightSquare, - LeftParen, - RightParen, - LeftCurly, - RightCurly, -} - -#[allow(clippy::enum_glob_use)] -use self::Kind::*; - -impl Kind { - pub fn is_eof(&self) -> bool { - matches!(self, Eof) - } - - pub fn is_trivia(&self) -> bool { - matches!(self, Whitespace | Comment) - } - - pub fn is_numeric(&self) -> bool { - matches!(self, Number | Percentage | Dimension) - } - - pub fn is_function_like(&self) -> bool { - matches!(self, Url | Function) - } - - pub fn is_bad(&self) -> bool { - matches!(self, BadUrl | BadString) - } -} - -impl fmt::Display for Kind { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) - } -} - -#[cfg(test)] -mod test { - use crate::*; - #[test] - fn display() { - assert_eq!(format!("{}", Kind::Eof), "Eof"); - assert_eq!(format!("{}", Kind::BadString), "BadString"); - } -} diff --git a/crates/hdx_lexer/src/lib.rs b/crates/hdx_lexer/src/lib.rs index 55d45255..525a338f 100644 --- a/crates/hdx_lexer/src/lib.rs +++ b/crates/hdx_lexer/src/lib.rs @@ -1,17 +1,12 @@ mod constants; -mod kind; mod private; -mod span; mod string_builder; mod token; use std::{collections::VecDeque, str::Chars}; use oxc_allocator::Allocator; -pub use span::Span; -pub use token::{PairWise, Token, TokenValue}; - -pub use self::kind::Kind; +pub use token::{Escaped, NumType, PairWise, Token}; #[derive(Debug, Clone)] pub struct LexerCheckpoint<'a> { @@ -38,6 +33,12 @@ impl<'a> Lexer<'a> { self.current.chars.as_str() } + /// Current position in file + #[inline] + pub fn pos(&self) -> u32 
{ + (self.source.len() - self.remaining().len()) as u32 + } + /// Creates a checkpoint storing the current lexer state. /// Use `rewind` to restore the lexer to the state stored in the checkpoint. pub fn checkpoint(&self) -> LexerCheckpoint<'a> { @@ -71,8 +72,7 @@ impl<'a> Lexer<'a> { self.current.token = Token::default(); for _i in self.lookahead.len()..n { - let kind = self.read_next_token(); - let peeked = self.finish_next(kind); + let peeked = self.read_next_token(); self.lookahead .push_back(LexerCheckpoint { chars: self.current.chars.clone(), token: peeked }); } @@ -96,7 +96,6 @@ impl<'a> Lexer<'a> { self.current.chars = checkpoint.chars; return checkpoint.token; } - let kind = self.read_next_token(); - self.finish_next(kind) + self.read_next_token() } } diff --git a/crates/hdx_lexer/src/private.rs b/crates/hdx_lexer/src/private.rs index 450e719b..1390b229 100644 --- a/crates/hdx_lexer/src/private.rs +++ b/crates/hdx_lexer/src/private.rs @@ -1,4 +1,4 @@ -use hdx_atom::Atom; +use hdx_atom::{atom, Atom}; use hdx_syntax::{ identifier::{is_ident, is_ident_ascii_start, is_ident_start, is_ident_start_sequence}, is_escape_sequence, is_newline, is_quote, is_sign, is_whitespace, @@ -8,241 +8,148 @@ use hdx_syntax::{ use crate::{ constants::{SINGLE_CHAR_TOKENS, SURROGATE_RANGE}, - kind::Kind, string_builder::AutoCow, - token::{Token, TokenValue}, + token::{Escaped, NumType, Token}, Lexer, }; impl<'a> Lexer<'a> { - pub(crate) fn finish_next(&mut self, kind: Kind) -> Token { - self.current.token.kind = kind; - self.current.token.span.end = self.offset(); - debug_assert!(self.current.token.span.start <= self.current.token.span.end); - std::mem::take(&mut self.current.token) - } - - #[inline] - fn offset(&self) -> u32 { - (self.source.len() - self.current.chars.as_str().len()) as u32 - } - - #[inline] - fn peek(&self) -> char { - self.nth(0) - } - #[inline] fn nth(&self, n: usize) -> char { self.current.chars.clone().nth(n).unwrap_or(EOF) } - fn set_kind_and_value(&mut 
self, kind: Kind, s: &'a str) -> Kind { - self.current.token.kind = kind; - self.current.token.value = match kind { - Kind::BadUrl | Kind::BadString => TokenValue::None, - Kind::Url | Kind::String | Kind::Ident => TokenValue::String(Atom::from(s)), - Kind::Function => TokenValue::String(Atom::from(s)), - Kind::AtKeyword => TokenValue::String(Atom::from(s)), - Kind::Hash => match s { - _ if s.starts_with(is_ident_start) => TokenValue::String(Atom::from(s)), - _ => TokenValue::Unrestricted(Atom::from(s)), - }, - Kind::Number | Kind::Percentage => { - let signed = self.current.token.value.is_signed(); - let int = self.current.token.value.is_int(); - TokenValue::Number { signed, int, value: self.parse_number(s) } - } - Kind::Dimension => { - let signed = self.current.token.value.is_signed(); - let int = self.current.token.value.is_int(); - TokenValue::Dimension { - signed, - int, - value: self.parse_number(s), - unit: Atom::from(""), - } - } - _ => unreachable!(), - }; - kind - } - - fn set_dimension_unit(&mut self, s: &'a str) { - let signed = self.current.token.value.is_signed(); - let int = self.current.token.value.is_int(); - let value = self.current.token.value.as_f32().unwrap(); - self.current.token.value = TokenValue::Dimension { signed, int, value, unit: Atom::from(s) } - } - - pub(crate) fn read_next_token(&mut self) -> Kind { - self.current.token.span.start = self.offset(); - - let offset = self.offset(); - self.current.token.span.start = offset; - let builder = AutoCow::new(self); - if let Some(c) = self.current.chars.next() { - let kind = self.match_char(c, builder); - if kind == Kind::Dimension { - let mut builder = AutoCow::new(self); - self.consume_ident_sequence(&mut builder); - self.set_dimension_unit(builder.finish(self)); - } - kind - } else { - Kind::Eof + pub(crate) fn read_next_token(&mut self) -> Token { + let remaining = self.current.chars.as_str(); + if remaining.is_empty() { + return Token::Eof; } - } - - fn match_char(&mut self, c: char, mut 
builder: AutoCow<'a>) -> Kind { + let c = self.nth(0); // fast path for single character tokens // '{' '}' '(' ')' '[' ']' ';' ',' ':' let size = c as usize; if size < 128 { - let kind = SINGLE_CHAR_TOKENS[size]; - if kind == Kind::Delim { - return self.consume_delim(c); - } else if kind != Kind::Undetermined { - return kind; + let token = &SINGLE_CHAR_TOKENS[size]; + if token != &Token::Undetermined { + self.current.chars.next(); + return token.clone(); } // fast path for identifiers if is_ident_ascii_start(c) { - builder.push_matching(c); - let kind = self.consume_ident_like_token(&mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); + return self.consume_ident_like_token(); } } match c { // Whitespace Range - c if is_whitespace(c) => { - self.consume_whitespace(); - Kind::Whitespace - } + c if is_whitespace(c) => self.consume_whitespace(), // Quote Range - c if is_quote(c) => self.consume_string_token(c), + c if is_quote(c) => self.consume_string_token(), // Digit Range - c if c.is_ascii_digit() => { - builder.push_matching(c); - let kind = self.consume_numeric_token(c, &mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); - } + c if c.is_ascii_digit() => self.consume_numeric_token(), // Sign Range '-' => { - if self.peek() == '-' && self.nth(1) == '>' { + if self.nth(1) == '-' && self.nth(2) == '>' { self.current.chars.next(); self.current.chars.next(); - return Kind::Cdc; + self.current.chars.next(); + return Token::Cdc; } - if is_ident_start_sequence(c, self.peek(), self.nth(1)) { - builder.push_matching(c); - let kind = self.consume_ident_like_token(&mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); + if is_ident_start_sequence(c, self.nth(1), self.nth(2)) { + return self.consume_ident_like_token(); } if self.is_number_start(c) { - builder.push_matching(c); - let kind = self.consume_numeric_token(c, &mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); + return 
self.consume_numeric_token(); } - self.consume_delim(c) + Token::Delim(self.current.chars.next().unwrap()) } // Dot or Plus '.' | '+' => { if self.is_number_start(c) { - builder.push_matching(c); - let kind = self.consume_numeric_token(c, &mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); + return self.consume_numeric_token(); } - self.consume_delim(c) + Token::Delim(self.current.chars.next().unwrap()) } // Less Than '<' => { - if self.peek() == '!' && self.nth(1) == '-' && self.nth(2) == '-' { + if self.nth(1) == '!' && self.nth(2) == '-' && self.nth(3) == '-' { + self.current.chars.next(); self.current.chars.next(); self.current.chars.next(); self.current.chars.next(); - return Kind::Cdo; + return Token::Cdo; } - self.consume_delim(c) + Token::Delim(self.current.chars.next().unwrap()) } // Hash / Pound Sign '#' => { - if is_ident(self.peek()) || is_escape_sequence(self.peek(), self.nth(1)) { - let mut builder = AutoCow::new(self); - self.consume_ident_sequence(&mut builder); - return self.set_kind_and_value(Kind::Hash, builder.finish(self)); + if is_ident(self.nth(1)) || is_escape_sequence(self.nth(1), self.nth(2)) { + self.consume_hash_token() + } else { + Token::Delim(self.current.chars.next().unwrap()) } - self.consume_delim(c) } // Commercial At '@' => { - if is_ident_start_sequence(self.peek(), self.nth(1), self.nth(2)) { - let mut builder = AutoCow::new(self); - self.consume_ident_sequence(&mut builder); - return self.set_kind_and_value(Kind::AtKeyword, builder.finish(self)); + if is_ident_start_sequence(self.nth(1), self.nth(2), self.nth(3)) { + let (ident, escaped) = self.consume_ident_sequence(); + return Token::AtKeyword(ident, escaped); } - self.consume_delim(c) + Token::Delim(self.current.chars.next().unwrap()) } // Reverse Solidus '\\' => { - if is_escape_sequence(c, self.peek()) { - builder.force_allocation_without_current_ascii_char(self); - builder.push_different(self.consume_escape_sequence()); - let kind = 
self.consume_ident_like_token(&mut builder); - return self.set_kind_and_value(kind, builder.finish(self)); + if is_escape_sequence(c, self.nth(1)) { + return self.consume_ident_like_token(); } - self.consume_delim(c) + Token::Delim(self.current.chars.next().unwrap()) } // Solidus - '/' => match self.peek() { + '/' => match self.nth(1) { '*' => { + self.current.chars.next(); self.current.chars.next(); self.consume_comment_token() } - _ => self.consume_delim(c), + _ => Token::Delim(self.current.chars.next().unwrap()), }, c if is_ident_start(c) => { - builder.push_matching(c); - let kind = self.consume_ident_like_token(&mut builder); - self.set_kind_and_value(kind, builder.finish(self)) + return self.consume_ident_like_token(); } - _ => self.consume_delim(c), + _ => Token::Delim(self.current.chars.next().unwrap()), } } - fn consume_delim(&mut self, c: char) -> Kind { - self.current.token.value = TokenValue::Char(c); - Kind::Delim - } - - fn consume_whitespace(&mut self) { + fn consume_whitespace(&mut self) -> Token { loop { - if is_whitespace(self.peek()) { + if is_whitespace(self.nth(0)) { self.current.chars.next(); } else { - return; + return Token::Whitespace; } } } - fn consume_ident_sequence(&mut self, builder: &mut AutoCow<'a>) { + fn consume_ident_sequence(&mut self) -> (Atom, Escaped) { + let mut builder = AutoCow::new(self); + let mut escaped = Escaped::No; loop { - let mut c = self.peek(); + let mut c = self.nth(0); if is_ident(c) { c = self.current.chars.next().unwrap(); builder.push_matching(c); } else if is_escape_sequence(c, self.nth(1)) { + escaped = Escaped::Yes; self.current.chars.next(); builder.force_allocation_without_current_ascii_char(self); builder.push_different(self.consume_escape_sequence()); } else { - return; + return (Atom::from(builder.finish(self)), escaped); } } } fn consume_escape_sequence(&mut self) -> char { - self.current.token.escaped = true; - if !self.peek().is_ascii_hexdigit() { + if !self.nth(0).is_ascii_hexdigit() { let char = 
self.current.chars.next().unwrap_or(REPLACEMENT); return char; } @@ -252,10 +159,12 @@ impl<'a> Lexer<'a> { REPLACEMENT } - fn consume_url_sequence(&mut self, builder: &mut AutoCow<'a>) -> Kind { + fn consume_url_sequence(&mut self) -> Token { self.consume_whitespace(); + let mut builder = AutoCow::new(self); builder.start = self.remaining(); builder.value = None; + let mut escaped = Escaped::No; loop { let c = self.current.chars.next().unwrap_or(EOF); match c { @@ -288,8 +197,9 @@ impl<'a> Lexer<'a> { return self.consume_remnants_of_bad_url(); } '\\' => { - if is_escape_sequence(c, self.peek()) { + if is_escape_sequence(c, self.nth(0)) { builder.force_allocation_without_current_ascii_char(self); + escaped = Escaped::Yes; let c = self.consume_escape_sequence(); builder.push_different(c); } else { @@ -302,10 +212,10 @@ impl<'a> Lexer<'a> { } } } - Kind::Url + Token::Url(Atom::from(builder.finish(self)), escaped) } - fn consume_remnants_of_bad_url(&mut self) -> Kind { + fn consume_remnants_of_bad_url(&mut self) -> Token { loop { match self.current.chars.next().unwrap_or(EOF) { ')' => { @@ -315,7 +225,7 @@ impl<'a> Lexer<'a> { break; } c @ '\\' => { - if is_escape_sequence(c, self.peek()) { + if is_escape_sequence(c, self.nth(0)) { self.current.chars.next(); self.consume_escape_sequence(); } @@ -323,65 +233,71 @@ impl<'a> Lexer<'a> { _ => {} } } - Kind::BadUrl + Token::BadUrl } - fn consume_numeric_token(&mut self, c: char, builder: &mut AutoCow<'a>) -> Kind { - let (signed, int) = self.consume_number_sequence(c); - match self.peek() { - '%' => { - self.current.chars.next(); - builder.force_allocation_without_current_ascii_char(self); - self.current.token.value = TokenValue::Number { signed, int, value: 0.0 }; - Kind::Percentage - } - c if is_ident_start_sequence(c, self.nth(1), self.nth(2)) => { - self.current.token.value = - TokenValue::Dimension { signed, int, value: 0.0, unit: Atom::from("") }; - Kind::Dimension - } - _ => { - self.current.token.value = 
TokenValue::Number { signed, int, value: 0.0 }; - Kind::Number - } + fn consume_numeric_token(&mut self) -> Token { + let mut builder = AutoCow::new(self); + let c = self.current.chars.next().unwrap(); + builder.push_matching(c); + let mut num_type = NumType::UnsignedInt; + if is_sign(c) { + num_type = num_type.signed(); + } + if c == '.' { + num_type = num_type.float(); } - } - - fn consume_number_sequence(&mut self, c: char) -> (bool, bool) { - let signed = is_sign(c); - let mut int = c != '.'; self.consume_decimal_digits(); - if int && self.peek() == '.' && self.nth(1).is_ascii_digit() { + if num_type.is_int() && self.nth(0) == '.' && self.nth(1).is_ascii_digit() { self.current.chars.next(); self.consume_decimal_digits(); - int = false; + num_type = num_type.float(); } - if matches!(self.peek(), 'e' | 'E') + if matches!(self.nth(0), 'e' | 'E') && (self.nth(1).is_ascii_digit() || (matches!(self.nth(1), '-' | '+') && self.nth(2).is_ascii_digit())) { self.current.chars.next(); - if matches!(self.peek(), '-' | '+') { + if matches!(self.nth(0), '-' | '+') { self.current.chars.next(); } self.consume_decimal_digits(); - int = false; + num_type = num_type.float(); + } + let value = self.parse_number(builder.finish(self)); + match self.nth(0) { + '%' => { + self.current.chars.next(); + Token::Dimension(num_type, value, atom!("%"), Escaped::No) + } + c if is_ident_start_sequence(c, self.nth(1), self.nth(2)) => { + let (unit, escaped) = self.consume_ident_sequence(); + Token::Dimension(num_type, value, unit, escaped) + } + _ => Token::Number(num_type, value), + } + } + + fn consume_hash_token(&mut self) -> Token { + let (ident, escaped) = self.consume_ident_sequence(); + if ident.starts_with(is_ident_start) { + Token::HashId(ident, escaped) + } else { + Token::Hash(ident, escaped) } - (signed, int) } fn consume_decimal_digits(&mut self) { - while self.peek().is_ascii_digit() { + while self.nth(0).is_ascii_digit() { self.current.chars.next(); } } - fn 
consume_ident_like_token(&mut self, builder: &mut AutoCow<'a>) -> Kind { - self.consume_ident_sequence(builder); - if self.peek() == '(' { + fn consume_ident_like_token(&mut self) -> Token { + let (ident, escaped) = self.consume_ident_sequence(); + if self.nth(0) == '(' { self.current.chars.next(); - let ident = builder.get_mut_string_without_current_ascii_char(self); - if is_url_ident(ident) { + if is_url_ident(&ident) { let mut chars = self.current.chars.clone(); let mut char = chars.next().unwrap_or(EOF); for _i in 0..=3 { @@ -391,47 +307,52 @@ impl<'a> Lexer<'a> { } if !is_quote(char) { self.consume_whitespace(); - return self.consume_url_sequence(builder); + return self.consume_url_sequence(); } } - return Kind::Function; + return Token::Function(ident, escaped); } - Kind::Ident + Token::Ident(ident, escaped) } - fn consume_string_token(&mut self, delimiter: char) -> Kind { + fn consume_string_token(&mut self) -> Token { + let delimiter = self.current.chars.next().unwrap(); let mut builder = AutoCow::new(self); + let mut escaped = Escaped::No; loop { - match self.peek() { + match self.nth(0) { c if is_newline(c) => { - return Kind::BadString; + return Token::BadString; } EOF => { - return self.set_kind_and_value(Kind::String, builder.finish(self)); + return Token::String(Atom::from(builder.finish(self)), escaped); } c @ ('"' | '\'') => { self.current.chars.next(); if c == delimiter { - return self - .set_kind_and_value(Kind::String, builder.finish_without_push(self)); + return Token::String( + Atom::from(builder.finish_without_push(self)), + escaped, + ); } builder.push_matching(c); } '\\' => { let c = self.current.chars.next().unwrap(); builder.force_allocation_without_current_ascii_char(self); - match self.peek() { + match self.nth(0) { EOF => { - return self.set_kind_and_value(Kind::String, builder.finish(self)); + return Token::String(Atom::from(builder.finish(self)), escaped); } p if is_newline(p) => { self.current.chars.next(); } p if 
is_escape_sequence(c, p) => { + escaped = Escaped::Yes; builder.push_different(self.consume_escape_sequence()); } _ => { - return Kind::BadString; + return Token::BadString; } } } @@ -443,25 +364,25 @@ impl<'a> Lexer<'a> { } } - fn consume_comment_token(&mut self) -> Kind { + fn consume_comment_token(&mut self) -> Token { while let Some(c) = self.current.chars.next() { - if c == '*' && self.peek() == '/' { + if c == '*' && self.nth(0) == '/' { self.current.chars.next(); - return Kind::Comment; + return Token::Comment; } } - Kind::Comment + Token::Comment } fn is_number_start(&mut self, c: char) -> bool { c.is_ascii_digit() - || (is_sign(c) && self.peek().is_ascii_digit()) - || (is_sign(c) && self.peek() == '.' && self.nth(1).is_ascii_digit()) - || (c == '.' && self.peek().is_ascii_digit()) + || (is_sign(c) && self.nth(0).is_ascii_digit()) + || (is_sign(c) && self.nth(0) == '.' && self.nth(1).is_ascii_digit()) + || (c == '.' && self.nth(0).is_ascii_digit()) } fn hex_digit(&mut self) -> Option { - let value = match self.peek() { + let value = match self.nth(0) { c if c.is_ascii_digit() => c as u32 - '0' as u32, c @ 'a'..='f' => 10 + (c as u32 - 'a' as u32), c @ 'A'..='F' => 10 + (c as u32 - 'A' as u32), @@ -481,7 +402,7 @@ impl<'a> Lexer<'a> { break; } } - if is_whitespace(self.peek()) { + if is_whitespace(self.nth(0)) { self.current.chars.next(); } if value == 0 || SURROGATE_RANGE.contains(&value) { diff --git a/crates/hdx_lexer/src/string_builder.rs b/crates/hdx_lexer/src/string_builder.rs index 5d3fcb12..9cbb3272 100644 --- a/crates/hdx_lexer/src/string_builder.rs +++ b/crates/hdx_lexer/src/string_builder.rs @@ -5,67 +5,58 @@ use bumpalo::collections::String; use crate::Lexer; pub struct AutoCow<'a> { - pub start: &'a str, - pub value: Option>, + pub start: &'a str, + pub value: Option>, } impl<'a> AutoCow<'a> { - pub fn new(lexer: &Lexer<'a>) -> Self { - let start = lexer.remaining(); - AutoCow { start, value: None } - } - - // Push a char that matches 
lexer.chars().next() - pub fn push_matching(&mut self, c: char) { - if let Some(text) = &mut self.value { - text.push(c); - } - } - - // Push a different character than lexer.chars().next(). - // force_allocation_without_current_ascii_char must be called before this. - pub fn push_different(&mut self, c: char) { - debug_assert!(self.value.is_some()); - self.value.as_mut().unwrap().push(c); - } - - // Force allocation of a String, excluding the current ASCII character, - // and return the reference to it - pub fn get_mut_string_without_current_ascii_char<'b>( - &'b mut self, - lexer: &'_ Lexer<'a>, - ) -> &'b mut String<'a> { - self.force_allocation_without_current_ascii_char(lexer); - self.value.as_mut().unwrap() - } - - // Force allocation of a String, excluding the current ASCII character. - pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'a>) { - if self.value.is_some() { - return; - } - self.value = if self.start.len() == lexer.remaining().len() { - Some(String::from_str_in("", lexer.allocator)) - } else { - Some(String::from_str_in( - &self.start[..self.start.len() - lexer.remaining().len() - 1], - lexer.allocator, - )) - }; - } - - pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len()], - } - } - - // Just like finish, but without pushing current char. 
- pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str { - match self.value.take() { - Some(s) => s.into_bump_str(), - None => &self.start[..self.start.len() - lexer.remaining().len() - 1], - } - } + pub fn new(lexer: &Lexer<'a>) -> Self { + let start = lexer.remaining(); + AutoCow { start, value: None } + } + + // Push a char that matches lexer.chars().next() + pub fn push_matching(&mut self, c: char) { + + if let Some(text) = &mut self.value { + text.push(c); + } + } + + // Push a different character than lexer.chars().next(). + // force_allocation_without_current_ascii_char must be called before this. + pub fn push_different(&mut self, c: char) { + debug_assert!(self.value.is_some()); + self.value.as_mut().unwrap().push(c); + } + + // Force allocation of a String, excluding the current ASCII character. + pub fn force_allocation_without_current_ascii_char(&mut self, lexer: &'_ Lexer<'a>) { + if self.value.is_some() { + return; + } + self.value = if self.start.len() == lexer.remaining().len() { + Some(String::from_str_in("", lexer.allocator)) + } else { + Some(String::from_str_in( + &self.start[..self.start.len() - lexer.remaining().len() - 1], + lexer.allocator, + )) + }; + } + + pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str { + match self.value.take() { + Some(s) => s.into_bump_str(), + None => &self.start[..self.start.len() - lexer.remaining().len()], + } + } + + // Just like finish, but without pushing current char. 
+ pub fn finish_without_push(mut self, lexer: &Lexer<'a>) -> &'a str { + match self.value.take() { + Some(s) => s.into_bump_str(), + None => &self.start[..self.start.len() - lexer.remaining().len() - 1], + } + } } diff --git a/crates/hdx_lexer/src/token.rs b/crates/hdx_lexer/src/token.rs index cd14c55e..d1559255 100644 --- a/crates/hdx_lexer/src/token.rs +++ b/crates/hdx_lexer/src/token.rs @@ -4,50 +4,188 @@ use hdx_atom::Atom; #[cfg(feature = "serde")] use serde::Serialize; -use super::kind::Kind; -use crate::Span; +#[derive(Debug, Copy, Clone, PartialEq, Default, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize))] +pub enum NumType { + #[default] + UnsignedInt, + SignedInt, + UnsignedFloat, + SignedFloat, +} + +impl NumType { + pub(crate) fn is_int(&self) -> bool { + matches!(*self, NumType::UnsignedInt | NumType::SignedInt) + } + + pub(crate) fn signed(&self) -> NumType { + match *self { + NumType::UnsignedInt => NumType::SignedInt, + NumType::UnsignedFloat => NumType::SignedFloat, + x => x, + } + } + + pub(crate) fn float(&self) -> NumType { + match *self { + NumType::UnsignedInt => NumType::UnsignedFloat, + NumType::SignedInt => NumType::SignedFloat, + x => x, + } + } +} #[derive(Debug, Clone, PartialEq, Default, Hash)] #[cfg_attr(feature = "serde", derive(Serialize))] -pub struct Token { - pub kind: Kind, +pub enum Escaped { + #[default] + No, + Yes, +} + +#[derive(Debug, Clone, PartialEq, Default)] +#[cfg_attr(feature = "serde", derive(Serialize))] +pub enum Token { + #[default] + // A token yet to be built + Undetermined, + + // - the end of a file (https://drafts.csswg.org/css-syntax/#typedef-eof-token) + Eof, + + // (https://drafts.csswg.org/css-syntax/#comment-diagram) + Comment, + + // (https://drafts.csswg.org/css-syntax/#ident-token-diagram) + Ident(Atom, Escaped), + + // (https://drafts.csswg.org/css-syntax/#function-token-diagram) + Function(Atom, Escaped), + + // https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram + AtKeyword(Atom, 
Escaped), + + // "unrestricted" (https://drafts.csswg.org/css-syntax/#hash-token-diagram) + Hash(Atom, Escaped), + + // "id" (https://drafts.csswg.org/css-syntax/#hash-token-diagram) + HashId(Atom, Escaped), + + // (https://drafts.csswg.org/css-syntax/#string-token-diagram) + String(Atom, Escaped), + + // (https://drafts.csswg.org/css-syntax/#typedef-bad-string-token) + BadString, + + // (https://drafts.csswg.org/css-syntax/#url-token-diagram) + Url(Atom, Escaped), + + // (https://drafts.csswg.org/css-syntax/#typedef-bad-url-token) + BadUrl, + + // (https://drafts.csswg.org/css-syntax/#typedef-delim-token) + Delim(char), + + // (https://drafts.csswg.org/css-syntax/#number-token-diagram) + Number(NumType, f32), + + // (https://drafts.csswg.org/css-syntax/#dimension-token-diagram) + Dimension(NumType, f32, Atom, Escaped), + + // (https://drafts.csswg.org/css-syntax/#whitespace-token-diagram) + Whitespace, - #[cfg_attr(feature = "serde", serde(flatten))] - pub span: Span, + // (https://drafts.csswg.org/css-syntax/#CDO-token-diagram) + Cdo, - pub escaped: bool, + // (https://drafts.csswg.org/css-syntax/#CDC-token-diagram) + Cdc, - pub value: TokenValue, + // (https://drafts.csswg.org/css-syntax/#typedef-colon-token) + Colon, + + // (https://drafts.csswg.org/css-syntax/#typedef-semicolon-token) + Semicolon, + + // (https://drafts.csswg.org/css-syntax/#typedef-comma-token) + Comma, + + // <[-token> (https://drafts.csswg.org/css-syntax/#tokendef-open-square) + LeftSquare, + + // <]-token> (https://drafts.csswg.org/css-syntax/#tokendef-close-square) + RightSquare, + + // <(-token> (https://drafts.csswg.org/css-syntax/#tokendef-open-paren) + LeftParen, + + // <)-token> (https://drafts.csswg.org/css-syntax/#tokendef-close-paren) + RightParen, + + // <{-token> (https://drafts.csswg.org/css-syntax/#tokendef-open-curly) + LeftCurly, + + // <}-token> (https://drafts.csswg.org/css-syntax/#tokendef-close-curly) + RightCurly, } impl Token { + #[inline] + pub fn 
constains_escape(&self) -> bool { + match *self { + Token::Ident(_, Escaped::Yes) + | Token::Function(_, Escaped::Yes) + | Token::AtKeyword(_, Escaped::Yes) + | Token::Hash(_, Escaped::Yes) + | Token::HashId(_, Escaped::Yes) + | Token::String(_, Escaped::Yes) + | Token::BadString + | Token::Url(_, Escaped::Yes) => true, + _ => false, + } + } + #[inline] pub fn is_trivia(&self) -> bool { - self.kind.is_trivia() + matches!(self, Token::Whitespace | Token::Comment) } #[inline] pub fn is_bad(&self) -> bool { - self.kind.is_bad() + matches!(self, Token::BadString | Token::BadUrl) } #[inline] pub fn as_atom(&self) -> Option { - self.value.as_atom() + match self { + Token::Ident(value, _) => Some(value.clone()), + Token::Function(value, _) => Some(value.clone()), + Token::AtKeyword(value, _) => Some(value.clone()), + Token::Hash(value, _) => Some(value.clone()), + Token::HashId(value, _) => Some(value.clone()), + Token::String(value, _) => Some(value.clone()), + Token::Url(value, _) => Some(value.clone()), + _ => None, + } } #[inline] pub fn as_atom_lower(&self) -> Option { - self.value.as_atom_lower() + self.as_atom().map(|s| s.to_ascii_lowercase()) } pub fn matches_ignore_case(&self, str: &Atom) -> bool { - self.value.as_atom().map_or(false, |s| s.eq_ignore_ascii_case(str)) + self.as_atom().map_or(false, |s| s.eq_ignore_ascii_case(str)) + } + + pub fn is_function_like(&self) -> bool { + matches!(self, Token::Url(_, _) | Token::Function(_, _)) } pub fn is_dashed_ident(&self) -> bool { - match self.kind { - Kind::Ident => self.value.as_atom().unwrap().starts_with("--"), + match self { + Token::Ident(value, _) => value.starts_with("--"), _ => false, } } @@ -56,146 +194,164 @@ impl Token { pub fn to_pairwise(&self) -> Option { PairWise::from_token(self) } -} - -#[derive(Debug, Eq, PartialEq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize))] -pub enum PairWise { - Paren, - Curly, - Square, -} -impl PairWise { - pub fn from_token(token: &Token) -> Option { - match 
token.kind { - Kind::LeftParen | Kind::Function => Some(Self::Paren), - Kind::LeftCurly => Some(Self::Curly), - Kind::LeftSquare => Some(Self::Square), - Kind::RightParen => Some(Self::Paren), - Kind::RightCurly => Some(Self::Curly), - Kind::RightSquare => Some(Self::Square), - _ => None, - } - } - - pub fn start(&self) -> Kind { + pub fn as_f32(&self) -> Option { match self { - Self::Paren => Kind::LeftParen, - Self::Curly => Kind::LeftCurly, - Self::Square => Kind::LeftSquare, + Self::Number(_, value) => Some(*value), + Self::Dimension(_, value, _, _) => Some(*value), + _ => None, } } - pub fn end(&self) -> Kind { + pub fn as_i32(&self) -> Option { match self { - Self::Paren => Kind::RightParen, - Self::Curly => Kind::RightCurly, - Self::Square => Kind::RightSquare, + Self::Number(_, value) => Some(*value as i32), + Self::Dimension(_, value, _, _) => Some(*value as i32), + _ => None, } } -} -#[derive(Debug, Clone, PartialEq)] -#[cfg_attr(feature = "serde", derive(Serialize), serde(untagged))] -pub enum TokenValue { - None, - String(Atom), - Char(char), - Number { value: f32, signed: bool, int: bool }, - Dimension { value: f32, signed: bool, int: bool, unit: Atom }, - Unrestricted(Atom), -} - -impl Default for TokenValue { - fn default() -> Self { - Self::None - } -} - -impl TokenValue { - pub fn as_f32(&self) -> Option { + pub fn as_char(&self) -> Option { match self { - Self::Number { value, .. } => Some(*value), - Self::Dimension { value, .. } => Some(*value), + Self::Delim(s) => Some(*s), _ => None, } } - pub fn as_i32(&self) -> Option { + pub fn is_signed(&self) -> bool { match self { - Self::Number { value, .. } => Some(*value as i32), - Self::Dimension { value, .. 
} => Some(*value as i32), - _ => None, + Self::Number(NumType::SignedInt, _) => true, + Self::Number(NumType::SignedFloat, _) => true, + Self::Dimension(NumType::SignedInt, _, _, _) => true, + Self::Dimension(NumType::SignedFloat, _, _, _) => true, + _ => false, } } - pub fn as_atom(&self) -> Option { + pub fn is_int(&self) -> bool { match self { - Self::String(s) => Some(s.into()), - Self::Unrestricted(s) => Some(s.into()), - Self::Dimension { unit, .. } => Some(unit.into()), - _ => None, + Self::Number(NumType::SignedInt, _) => true, + Self::Number(NumType::UnsignedInt, _) => true, + Self::Dimension(NumType::SignedInt, _, _, _) => true, + Self::Dimension(NumType::UnsignedInt, _, _, _) => true, + _ => false, } } +} - pub fn as_atom_lower(&self) -> Option { - self.as_atom().map(|s| s.to_ascii_lowercase()) - } +#[derive(Debug, Eq, PartialEq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize))] +pub enum PairWise { + Paren, + Curly, + Square, +} - pub fn as_char(&self) -> Option { - match self { - Self::Char(s) => Some(*s), +impl PairWise { + pub fn from_token(token: &Token) -> Option { + match token { + Token::LeftParen | Token::Function(_, _) => Some(Self::Paren), + Token::LeftCurly => Some(Self::Curly), + Token::LeftSquare => Some(Self::Square), + Token::RightParen => Some(Self::Paren), + Token::RightCurly => Some(Self::Curly), + Token::RightSquare => Some(Self::Square), _ => None, } } - pub fn is_signed(&self) -> bool { + pub fn start(&self) -> Token { match self { - Self::Number { signed, .. } => *signed, - Self::Dimension { signed, .. } => *signed, - _ => false, + Self::Paren => Token::LeftParen, + Self::Curly => Token::LeftCurly, + Self::Square => Token::LeftSquare, } } - pub fn is_int(&self) -> bool { + pub fn end(&self) -> Token { match self { - Self::Number { int, .. } => *int, - Self::Dimension { int, .. 
} => *int, - _ => false, + Self::Paren => Token::RightParen, + Self::Curly => Token::RightCurly, + Self::Square => Token::RightSquare, } } } -impl Hash for TokenValue { +impl Hash for Token { fn hash(&self, state: &mut H) { match self { - Self::None => 0.hash(state), - Self::String(s) => { - 1.hash(state); - s.hash(state); - } - Self::Char(c) => { + Token::Undetermined => {} + Token::Eof => 0.hash(state), + Token::Comment => 1.hash(state), + Token::Ident(a, e) => { 2.hash(state); - c.hash(state); + a.hash(state); + e.hash(state); } - Self::Number { value, signed, int } => { + Token::Function(a, e) => { 3.hash(state); - value.to_bits().hash(state); - signed.hash(state); - int.hash(state); + a.hash(state); + e.hash(state); } - Self::Dimension { value, signed, int, unit } => { + Token::AtKeyword(a, e) => { 4.hash(state); - value.to_bits().hash(state); - signed.hash(state); - int.hash(state); - unit.hash(state); + a.hash(state); + e.hash(state); } - Self::Unrestricted(s) => { + Token::Hash(a, e) => { 5.hash(state); - s.hash(state); + a.hash(state); + e.hash(state); + } + Token::HashId(a, e) => { + 6.hash(state); + a.hash(state); + e.hash(state); + } + Token::String(a, e) => { + 7.hash(state); + a.hash(state); + e.hash(state); + } + Token::BadString => { + 8.hash(state); } - }; + Token::Url(a, e) => { + 9.hash(state); + a.hash(state); + e.hash(state); + } + Token::BadUrl => { + 10.hash(state); + } + Token::Delim(c) => { + 11.hash(state); + c.hash(state); + } + Token::Number(n, f) => { + 12.hash(state); + n.hash(state); + f.to_bits().hash(state); + } + Token::Dimension(n, f, a, e) => { + 13.hash(state); + n.hash(state); + f.to_bits().hash(state); + a.hash(state); + e.hash(state); + } + Token::Whitespace => 14.hash(state), + Token::Cdo => 15.hash(state), + Token::Cdc => 16.hash(state), + Token::Colon => 17.hash(state), + Token::Semicolon => 18.hash(state), + Token::Comma => 19.hash(state), + Token::LeftSquare => 20.hash(state), + Token::RightSquare => 21.hash(state), + 
Token::LeftParen => 22.hash(state), + Token::RightParen => 23.hash(state), + Token::LeftCurly => 24.hash(state), + Token::RightCurly => 25.hash(state), + } } } diff --git a/crates/hdx_lexer/tests/tests.rs b/crates/hdx_lexer/tests/tests.rs index f3c12aa8..258b8b70 100644 --- a/crates/hdx_lexer/tests/tests.rs +++ b/crates/hdx_lexer/tests/tests.rs @@ -1,88 +1,100 @@ -use hdx_lexer::{Kind, Lexer, Span, Token, TokenValue}; +use hdx_atom::atom; +use hdx_lexer::{Escaped, Lexer, Token}; use oxc_allocator::Allocator; -fn consume_lex<'a>(allocator: &'a Allocator, source: &'a str) -> (Lexer<'a>, Vec) { - let mut lex = Lexer::new(allocator, source); - let mut tokens: Vec = vec![]; - loop { - let token = lex.next_token(); - if token.kind == Kind::Eof { - break; - } - tokens.push(token); - } - (lex, tokens) -} - #[test] fn size_test() { - assert_eq!(::std::mem::size_of::(), 32); - assert_eq!(::std::mem::size_of::(), 16); + assert_eq!(::std::mem::size_of::(), 16); } #[test] -fn smoke_test() { +fn empty() { let allocator = Allocator::default(); - let (_lex, tokens) = consume_lex(&allocator, ""); - assert_eq!(tokens, vec![]); + let mut lex = Lexer::new(&allocator, ""); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 0); } #[test] fn tokenizes_tilde_as_ddelim() { let allocator = Allocator::default(); - let (_lex, tokens) = consume_lex(&allocator, "~"); - assert_eq!( - tokens, - vec![Token { - kind: Kind::Delim, - span: Span::new(0, 1), - escaped: false, - value: TokenValue::Char('~'), - }] - ); + let mut lex = Lexer::new(&allocator, "~"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Delim('~')); + assert_eq!(lex.pos(), 1); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 1); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 1); } #[test] fn tokenizes_newlines_as_whitespace() { let allocator = 
Allocator::default(); - let (_lex, tokens) = consume_lex(&allocator, "\r\n"); - assert_eq!( - tokens, - vec![Token { - kind: Kind::Whitespace, - span: Span::new(0, 2), - escaped: false, - value: TokenValue::None, - }] - ); + let mut lex = Lexer::new(&allocator, "\r\n"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 2); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 2); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 2); } #[test] fn tokenizes_multiple_newlines_as_whitespace() { let allocator = Allocator::default(); - let (_lex, tokens) = consume_lex(&allocator, "\n\r"); - assert_eq!( - tokens, - vec![Token { - kind: Kind::Whitespace, - span: Span::new(0, 2), - escaped: false, - value: TokenValue::None, - }] - ); + let mut lex = Lexer::new(&allocator, "\n\r"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 2); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 2); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 2); } #[test] fn tokenizes_multiple_whitespace_as_whitespace() { let allocator = Allocator::default(); - let (_lex, tokens) = consume_lex(&allocator, "\t \t \t"); - assert_eq!( - tokens, - vec![Token { - kind: Kind::Whitespace, - span: Span::new(0, 5), - escaped: false, - value: TokenValue::None, - }] - ); + let mut lex = Lexer::new(&allocator, "\t \t \t"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 5); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 5); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 5); +} + +#[test] +fn tokenizes_trivial_css_file() { + let allocator = Allocator::default(); + let mut lex = Lexer::new(&allocator, "body { color: black }"); + assert_eq!(lex.pos(), 0); + assert_eq!(lex.next_token(), Token::Ident(atom!("body"), Escaped::No)); + 
assert_eq!(lex.pos(), 4); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 5); + assert_eq!(lex.next_token(), Token::LeftCurly); + assert_eq!(lex.pos(), 6); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 7); + assert_eq!(lex.next_token(), Token::Ident(atom!("color"), Escaped::No)); + assert_eq!(lex.pos(), 12); + assert_eq!(lex.next_token(), Token::Colon); + assert_eq!(lex.pos(), 13); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 14); + assert_eq!(lex.next_token(), Token::Ident(atom!("black"), Escaped::No)); + assert_eq!(lex.pos(), 19); + assert_eq!(lex.next_token(), Token::Whitespace); + assert_eq!(lex.pos(), 20); + assert_eq!(lex.next_token(), Token::RightCurly); + assert_eq!(lex.pos(), 21); + assert_eq!(lex.next_token(), Token::Eof); + assert_eq!(lex.pos(), 21); } diff --git a/crates/hdx_parser/src/cursor.rs b/crates/hdx_parser/src/cursor.rs index 4c82049e..5c81a918 100644 --- a/crates/hdx_parser/src/cursor.rs +++ b/crates/hdx_parser/src/cursor.rs @@ -1,13 +1,12 @@ use std::ops::Range; -use hdx_lexer::{Kind, LexerCheckpoint, Span, Token}; +use hdx_lexer::{LexerCheckpoint, Span, Token}; use crate::{diagnostics, Atom, Error, Parser, Result}; pub struct ParserCheckpoint<'a> { lexer: LexerCheckpoint<'a>, token: Token, - prev_span: Span, warnings_pos: usize, errors_pos: usize, } @@ -61,7 +60,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_ident_of(&mut self, atom: Atom) -> Result<()> { let ident = self.expect_without_advance(Kind::Ident)?.value.as_atom_lower().unwrap(); if atom != ident { - Err(diagnostics::ExpectedIdent(atom, ident, self.cur().span))? + Err(diagnostics::ExpectedIdent(atom, ident, self.pos()))? 
} self.advance(); Ok(()) @@ -78,7 +77,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_function_of(&mut self, atom: Atom) -> Result<()> { let ident = self.expect_without_advance(Kind::Function)?.value.as_atom_lower().unwrap(); if atom != ident { - Err(diagnostics::ExpectedFunction(atom, ident, self.cur().span))? + Err(diagnostics::ExpectedFunction(atom, ident, self.pos()))? } self.advance(); Ok(()) @@ -95,7 +94,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_at_keyword_of(&mut self, atom: Atom) -> Result<()> { let ident = self.expect_without_advance(Kind::AtKeyword)?.value.as_atom_lower().unwrap(); if atom != ident { - Err(diagnostics::ExpectedAtKeyword(atom, ident, self.cur().span))? + Err(diagnostics::ExpectedAtKeyword(atom, ident, self.pos()))? } self.advance(); Ok(()) @@ -125,7 +124,7 @@ impl<'a> Parser<'a> { #[inline] pub(crate) fn expect_delim_of(&mut self, ch: char) -> Result<()> { if ch != self.expect_without_advance(Kind::Delim)?.value.as_char().unwrap() { - Err(diagnostics::UnexpectedDelim(ch, self.cur().span))? + Err(diagnostics::UnexpectedDelim(ch, self.pos()))? } self.advance(); Ok(()) @@ -142,7 +141,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_number_gte(&mut self, min: f32) -> Result { let n = self.expect_without_advance(Kind::Number)?.value.as_f32().unwrap(); if n < min { - Err(diagnostics::NumberTooSmall(min, self.cur().span))? + Err(diagnostics::NumberTooSmall(min, self.pos()))? } self.advance(); Ok(n) @@ -152,7 +151,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_number_in_range(&mut self, range: Range) -> Result { let n = self.expect_without_advance(Kind::Number)?.value.as_f32().unwrap(); if !range.contains(&n) { - Err(diagnostics::NumberOutOfBounds(range.start, range.end, self.cur().span))? + Err(diagnostics::NumberOutOfBounds(range.start, range.end, self.pos()))? 
} self.advance(); Ok(n) @@ -162,7 +161,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_int(&mut self) -> Result { self.expect_without_advance(Kind::Number)?; if !self.cur().value.is_int() { - Err(diagnostics::DisallowedFloat(self.cur().value.as_i32().unwrap(), self.cur().span))? + Err(diagnostics::DisallowedFloat(self.cur().value.as_i32().unwrap(), self.pos()))? } let n = self.cur().value.as_i32().unwrap(); self.advance(); @@ -180,7 +179,7 @@ impl<'a> Parser<'a> { pub(crate) fn expect_percentage_gte(&mut self, min: f32) -> Result { let n = self.expect_without_advance(Kind::Percentage)?.value.as_f32().unwrap(); if n < min { - Err(diagnostics::NumberTooSmall(min, self.cur().span))? + Err(diagnostics::NumberTooSmall(min, self.pos()))? } self.advance(); Ok(n) @@ -199,7 +198,7 @@ impl<'a> Parser<'a> { let value = &self.expect_without_advance(Kind::Dimension)?.value; let (n, atom) = (value.as_f32().unwrap(), value.as_atom_lower().unwrap()); if n < min { - Err(diagnostics::NumberTooSmall(min, self.cur().span))? + Err(diagnostics::NumberTooSmall(min, self.pos()))? } self.advance(); Ok((n, atom)) @@ -210,7 +209,7 @@ impl<'a> Parser<'a> { let value = &self.expect_without_advance(Kind::Dimension)?.value; let (n, atom) = (value.as_f32().unwrap(), value.as_atom_lower().unwrap()); if !range.contains(&n) { - Err(diagnostics::NumberOutOfBounds(range.start, range.end, self.cur().span))? + Err(diagnostics::NumberOutOfBounds(range.start, range.end, self.pos()))? 
} self.advance(); Ok((n, atom)) @@ -219,7 +218,7 @@ impl<'a> Parser<'a> { #[inline] pub(crate) fn expect_without_advance(&mut self, kind: Kind) -> Result<&Token> { if !self.at(kind) { - let range = self.cur().span; + let range = self.pos(); Err::<(), Error>(diagnostics::ExpectedToken(kind, self.cur().kind, range).into())?; } Ok(self.cur()) @@ -242,12 +241,10 @@ impl<'a> Parser<'a> { #[inline] pub(crate) fn next_token_include_comments(&mut self) { - self.prev_span = self.token.span; self.token = self.lexer.next_token(); } pub(crate) fn next_token(&mut self) { - self.prev_span = self.token.span; loop { let token = self.lexer.next_token(); if token.kind != Kind::Comment { @@ -258,7 +255,6 @@ impl<'a> Parser<'a> { } pub(crate) fn advance(&mut self) { - self.prev_span = self.token.span; loop { let token = self.lexer.next_token(); if !token.is_trivia() { @@ -276,10 +272,9 @@ impl<'a> Parser<'a> { } pub(crate) fn rewind(&mut self, checkpoint: ParserCheckpoint<'a>) { - let ParserCheckpoint { lexer, token, prev_span, warnings_pos, errors_pos } = checkpoint; + let ParserCheckpoint { lexer, token, warnings_pos, errors_pos } = checkpoint; self.lexer.rewind(lexer); self.token = token; - self.prev_span = prev_span; self.warnings.truncate(warnings_pos); self.errors.truncate(errors_pos); } @@ -288,7 +283,6 @@ impl<'a> Parser<'a> { ParserCheckpoint { lexer: self.lexer.checkpoint(), token: self.token.clone(), - prev_span: self.prev_span, warnings_pos: self.warnings.len(), errors_pos: self.errors.len(), } diff --git a/crates/hdx_parser/src/diagnostics.rs b/crates/hdx_parser/src/diagnostics.rs index fa6c6105..6528657a 100644 --- a/crates/hdx_parser/src/diagnostics.rs +++ b/crates/hdx_parser/src/diagnostics.rs @@ -1,7 +1,7 @@ use miette::{self, Diagnostic}; use thiserror::{self, Error}; -use crate::{Atom, Kind, Span}; +use crate::{Atom, Span, Token}; #[derive(Debug, Error, Diagnostic)] #[error("The token as {0} cannot yet be parsed by the parser :(")] @@ -22,7 +22,7 @@ pub struct 
BadDeclaration(#[label("This is not valid syntax for a declaration.") #[derive(Debug, Error, Diagnostic)] #[error("Unexpected `{0}`")] #[diagnostic(help("This is not correct CSS syntax."), code(hdx_parser::Unexpected))] -pub struct Unexpected(pub Kind, #[label("This wasn't expected here")] pub Span); +pub struct Unexpected(pub Token, #[label("This wasn't expected here")] pub Span); #[derive(Debug, Error, Diagnostic)] #[error("Unexpected identifier '{0}'")] @@ -120,7 +120,7 @@ pub struct ExpectedEnd(#[label("All of this extra content was ignored.")] pub Sp #[derive(Debug, Error, Diagnostic)] #[error("Expected `{0}` but found `{1}` {2}")] #[diagnostic(help("This is not correct CSS syntax."), code(hdx_parser::ExpectedToken))] -pub struct ExpectedToken(pub Kind, pub Kind, #[label("`{0}` expected")] pub Span); +pub struct ExpectedToken(pub Token, pub Token, #[label("`{0}` expected")] pub Span); #[derive(Debug, Error, Diagnostic)] #[error("Expected the identifier `{0}` but found `{1}`")] @@ -143,7 +143,7 @@ pub struct ExpectedAtKeyword(pub Atom, pub Atom, #[label("This at-keyword")] pub help("Try removing the trailing {0} which will remove this warning."), code(hdx_parser::WarnTrailing) )] -pub struct WarnTrailing(pub Kind, #[label("This can be removed")] pub Span); +pub struct WarnTrailing(pub Token, #[label("This can be removed")] pub Span); #[derive(Debug, Error, Diagnostic)] #[error("Invalid hexidecimal value for color: '{0}'")] diff --git a/crates/hdx_parser/src/lib.rs b/crates/hdx_parser/src/lib.rs index 8ec7b156..fdbf5f87 100644 --- a/crates/hdx_parser/src/lib.rs +++ b/crates/hdx_parser/src/lib.rs @@ -141,7 +141,9 @@ impl<'a> Parser<'a> { } } if last_kind == Kind::Comma { - let warn: Error = diagnostics::WarnTrailing(self.cur().kind, self.cur().span).into(); + let warn: Error = + diagnostics::WarnTrailing(self.cur().kind, Span::from(self.pos() - 1, self.pos())) + .into(); if !self.sloppy { Err(warn)?; }