Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/0000.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for `\u{...}` Unicode escape sequences in VRL string literals.
143 changes: 129 additions & 14 deletions src/parser/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1276,6 +1276,61 @@ impl<'input> Lexer<'input> {
fn escape_code(&mut self, start: usize) -> Result<(), Error> {
match self.bump() {
Some((_, '\n' | '\'' | '"' | '\\' | 'n' | 'r' | 't' | '{' | '}' | '0')) => Ok(()),
Some((_, 'u')) => {
let Some((_, '{')) = self.bump() else {
return Err(Error::EscapeChar {
start,
ch: Some('u'),
});
};

let mut digits = String::new();
loop {
match self.peek() {
Some((_, '}')) => {
self.bump();
break;
}
Some((pos, ch)) if ch.is_ascii_hexdigit() => {
if digits.len() >= 6 {
return Err(Error::EscapeChar {
start: pos,
ch: Some(ch),
});
}
digits.push(ch);
self.bump();
}
Some((pos, ch)) => {
return Err(Error::EscapeChar {
start: pos,
ch: Some(ch),
});
}
None => {
return Err(Error::EscapeChar { start, ch: None });
}
}
}

if digits.is_empty() {
return Err(Error::EscapeChar {
start,
ch: Some('u'),
});
}

match u32::from_str_radix(&digits, 16)
.ok()
.and_then(char::from_u32)
{
Some(_) => Ok(()),
None => Err(Error::EscapeChar {
start,
ch: Some('u'),
}),
}
}
Some((start, ch)) => Err(Error::EscapeChar {
start,
ch: Some(ch),
Expand Down Expand Up @@ -1321,6 +1376,11 @@ pub(crate) fn is_operator(ch: char) -> bool {
fn unescape_string_literal(mut s: &str) -> String {
let mut string = String::with_capacity(s.len());
while let Some(i) = s.bytes().position(|b| b == b'\\') {
if i + 1 >= s.len() {
string.push_str(s);
return string;
}

let next = s.as_bytes()[i + 1];
if next == b'\n' {
// Remove the \n and any ensuing spaces or tabs
Expand All @@ -1332,23 +1392,63 @@ fn unescape_string_literal(mut s: &str) -> String {
.map(char::len_utf8)
.sum();
s = &s[i + whitespace + 2..];
} else {
let c = match next {
b'\'' => '\'',
b'"' => '"',
b'\\' => '\\',
b'n' => '\n',
b'r' => '\r',
b't' => '\t',
b'0' => '\0',
b'{' => '{',
_ => unimplemented!("invalid escape"),
};
continue;
}

string.push_str(&s[..i]);
if next == b'u' && s.as_bytes().get(i + 2) == Some(&b'{') {
let mut end = i + 3;
let mut digit_count = 0;
let mut valid = true;

while end < s.len() {
let b = s.as_bytes()[end];
if b == b'}' {
break;
}
if !(b as char).is_ascii_hexdigit() {
valid = false;
break;
}
digit_count += 1;
if digit_count > 6 {
valid = false;
break;
}
end += 1;
}

if valid && digit_count > 0 && end < s.len() && s.as_bytes()[end] == b'}' {
if let Ok(value) = u32::from_str_radix(&s[i + 3..end], 16) {
if let Some(value) = char::from_u32(value) {
string.push_str(&s[..i]);
string.push(value);
s = &s[end + 1..];
continue;
}
}
}
}

let (handled, c) = match next {
b'\'' => (true, '\''),
b'"' => (true, '"'),
b'\\' => (true, '\\'),
b'n' => (true, '\n'),
b'r' => (true, '\r'),
b't' => (true, '\t'),
b'0' => (true, '\0'),
b'{' => (true, '{'),
_ => (false, '\0'),
};

string.push_str(&s[..i]);
if handled {
string.push(c);
s = &s[i + 2..];
} else {
string.push('\\');
string.push(next as char);
}
s = &s[i + 2..];
}

string.push_str(s);
Expand Down Expand Up @@ -1532,6 +1632,21 @@ mod test {
assert_eq!(TemplateString(vec![StringSegment::Literal(r#""""#.to_string(), Span::new(1, 5))]), StringLiteralToken(r#"\"\""#).template(Span::new(0, 6)));
}

#[test]
fn string_literal_unicode_escape() {
    // Lexing: the expected token carries the escape sequence verbatim,
    // i.e. the raw `\u{7FFF}` text rather than the decoded character.
    let source = r#""\u{7FFF}""#;
    test(
        data(source),
        vec![(
            "~~~~~~~~~~",
            StringLiteral(StringLiteralToken("\\u{7FFF}")),
        )],
    );

    // Unescaping: the same raw text resolves to the code point U+7FFF.
    let token = StringLiteralToken(r#"\u{7FFF}"#);
    assert_eq!(token.unescape(), "\u{7FFF}");
}

#[test]
fn multiline_string_literals() {
let mut lexer = lexer(
Expand Down
Loading