From a34be67344f70647c76f9e548223294f6584db3d Mon Sep 17 00:00:00 2001 From: Liam Date: Thu, 21 Nov 2024 15:11:10 -0500 Subject: [PATCH 1/2] Add character set macro and use in place of Lazy --- src/character_set.rs | 18 ++++++++++++++++++ src/html.rs | 16 ++++++---------- src/lib.rs | 1 + src/parser/autolink.rs | 26 ++++---------------------- src/xml.rs | 12 +++--------- 5 files changed, 32 insertions(+), 41 deletions(-) create mode 100644 src/character_set.rs diff --git a/src/character_set.rs b/src/character_set.rs new file mode 100644 index 00000000..143af902 --- /dev/null +++ b/src/character_set.rs @@ -0,0 +1,18 @@ +macro_rules! character_set { + () => {{ + [false; 256] + }}; + + ($value:literal $(,$rest:literal)*) => {{ + const A: &[u8] = $value; + let mut a = character_set!($($rest),*); + let mut i = 0; + while i < A.len() { + a[A[i] as usize] = true; + i += 1; + } + a + }} +} + +pub(crate) use character_set; diff --git a/src/html.rs b/src/html.rs index 72fd1f0d..2cad4b90 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,4 +1,5 @@ //! The HTML renderer for the CommonMark AST, as well as helper functions. +use crate::character_set::character_set; use crate::ctype::isspace; use crate::nodes::{ AstNode, ListType, NodeCode, NodeFootnoteDefinition, NodeMath, NodeTable, NodeValue, @@ -297,16 +298,11 @@ pub fn escape(output: &mut dyn Write, buffer: &[u8]) -> io::Result<()> { /// the string "a b", rather than "?q=a%2520b", a search for the literal /// string "a%20b". 
pub fn escape_href(output: &mut dyn Write, buffer: &[u8]) -> io::Result<()> { - static HREF_SAFE: Lazy<[bool; 256]> = Lazy::new(|| { - let mut a = [false; 256]; - for &c in b"-_.+!*(),%#@?=;:/,+$~abcdefghijklmnopqrstuvwxyz".iter() { - a[c as usize] = true; - } - for &c in b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".iter() { - a[c as usize] = true; - } - a - }); + const HREF_SAFE: [bool; 256] = character_set!( + b"-_.+!*(),%#@?=;:/,+$~", + b"abcdefghijklmnopqrstuvwxyz", + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + ); let size = buffer.len(); let mut i = 0; diff --git a/src/lib.rs b/src/lib.rs index 3abdb656..3a2ba0ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,6 +67,7 @@ use std::io::BufWriter; pub mod adapters; pub mod arena_tree; +mod character_set; mod cm; mod ctype; mod entity; diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index ca377f35..456f9293 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -1,7 +1,7 @@ +use crate::character_set::character_set; use crate::ctype::{isalnum, isalpha, isspace}; use crate::nodes::{AstNode, NodeLink, NodeValue}; use crate::parser::inlines::make_inline; -use once_cell::sync::Lazy; use std::str; use typed_arena::Arena; use unicode_categories::UnicodeCategories; @@ -74,13 +74,7 @@ pub fn www_match<'a>( i: usize, relaxed_autolinks: bool, ) -> Option<(&'a AstNode<'a>, usize, usize)> { - static WWW_DELIMS: Lazy<[bool; 256]> = Lazy::new(|| { - let mut sc = [false; 256]; - for c in &[b'*', b'_', b'~', b'(', b'['] { - sc[*c as usize] = true; - } - sc - }); + const WWW_DELIMS: [bool; 256] = character_set!(b"*_~(["); if i > 0 && !isspace(contents[i - 1]) && !WWW_DELIMS[contents[i - 1] as usize] { return None; @@ -167,13 +161,7 @@ fn is_valid_hostchar(ch: char) -> bool { } fn autolink_delim(data: &[u8], mut link_end: usize, relaxed_autolinks: bool) -> usize { - static LINK_END_ASSORTMENT: Lazy<[bool; 256]> = Lazy::new(|| { - let mut sc = [false; 256]; - for c in &[b'?', b'!', b'.', b',', b':', b'*', b'_', 
b'~', b'\'', b'"'] { - sc[*c as usize] = true; - } - sc - }); + const LINK_END_ASSORTMENT: [bool; 256] = character_set!(b"?!.,:*_~'\""); for (i, &b) in data.iter().enumerate().take(link_end) { if b == b'<' { @@ -311,13 +299,7 @@ fn email_match<'a>( i: usize, relaxed_autolinks: bool, ) -> Option<(&'a AstNode<'a>, usize, usize)> { - static EMAIL_OK_SET: Lazy<[bool; 256]> = Lazy::new(|| { - let mut sc = [false; 256]; - for c in &[b'.', b'+', b'-', b'_'] { - sc[*c as usize] = true; - } - sc - }); + const EMAIL_OK_SET: [bool; 256] = character_set!(b".+-_"); let size = contents.len(); diff --git a/src/xml.rs b/src/xml.rs index 0dceabf3..3032efbd 100644 --- a/src/xml.rs +++ b/src/xml.rs @@ -1,6 +1,6 @@ +use crate::character_set::character_set; use crate::nodes::{AstNode, ListType, NodeCode, NodeMath, NodeTable, NodeValue}; use crate::parser::{Options, Plugins}; -use once_cell::sync::Lazy; use std::cmp; use std::io::{self, Write}; @@ -48,17 +48,11 @@ impl<'o> XmlFormatter<'o> { } fn escape(&mut self, buffer: &[u8]) -> io::Result<()> { - static XML_SAFE: Lazy<[bool; 256]> = Lazy::new(|| { - let mut a = [true; 256]; - for &c in b"&<>\"".iter() { - a[c as usize] = false; - } - a - }); + const XML_UNSAFE: [bool; 256] = character_set!(b"&<>\""); let mut offset = 0; for (i, &byte) in buffer.iter().enumerate() { - if !XML_SAFE[byte as usize] { + if XML_UNSAFE[byte as usize] { let esc: &[u8] = match byte { b'"' => b"&quot;", b'&' => b"&amp;", From eebca8568513deb242c5c67de6547802a488d1a6 Mon Sep 17 00:00:00 2001 From: Liam Date: Thu, 21 Nov 2024 15:15:27 -0500 Subject: [PATCH 2/2] Replace NEEDS_ESCAPED array with character_set macro --- src/html.rs | 40 +++------------------------------------- 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/src/html.rs b/src/html.rs index 2cad4b90..eb542c5d 100644 --- a/src/html.rs +++ b/src/html.rs @@ -136,42 +136,6 @@ struct HtmlFormatter<'o> { plugins: &'o Plugins<'o>, } -#[rustfmt::skip] -const NEEDS_ESCAPED : [bool; 256] = [ - false, 
false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, true, false, false, false, true, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, true, false, true, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, -]; - fn tagfilter(literal: &[u8]) -> bool { static TAGFILTER_BLACKLIST: [&str; 9] = [ "title", @@ -255,9 +219,11 @@ fn dangerous_url(input: &[u8]) -> bool 
{ /// Note that this is appropriate and sufficient for free text, but not for /// URLs in attributes. See escape_href. pub fn escape(output: &mut dyn Write, buffer: &[u8]) -> io::Result<()> { + const HTML_UNSAFE: [bool; 256] = character_set!(b"&<>\""); + let mut offset = 0; for (i, &byte) in buffer.iter().enumerate() { - if NEEDS_ESCAPED[byte as usize] { + if HTML_UNSAFE[byte as usize] { let esc: &[u8] = match byte { b'"' => b"&quot;", b'&' => b"&amp;",