Skip to content

Commit

Permalink
support decimal and hexadecimal character entities (e.g. "&#13;" or "…
Browse files Browse the repository at this point in the history
…&#x0D;")

The parser already supported named character entities, but support for arbitrary characters was missing.
When writing files, only \r will be replaced by &#13; in addition to the normal named entities.
  • Loading branch information
DanielT committed Jun 28, 2023
1 parent d84b71e commit 35241f7
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 5 deletions.
7 changes: 4 additions & 3 deletions autosar-data/src/chardata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ fn escape_text(input: &str) -> Cow<str> {
'&' => escaped.push_str("&amp;"),
'"' => escaped.push_str("&quot;"),
'\'' => escaped.push_str("&apos;"),
'\r' => escaped.push_str("&#13;"), // this could get messed up by git, if an arxml file is checked in on windows
other => escaped.push(other),
}
}
Expand Down Expand Up @@ -386,10 +387,10 @@ mod test {
assert_eq!(format!("{data}"), "text");

let mut out = "".to_string();
let data = CharacterData::String("special chars: <, >, &, \', \"".to_string());
let data = CharacterData::String("special chars: <, >, &, \', \", \r".to_string());
data.serialize_internal(&mut out);
assert_eq!(out, "special chars: &lt;, &gt;, &amp;, &apos;, &quot;");
assert_eq!(format!("{data}"), "special chars: <, >, &, \', \"");
assert_eq!(out, "special chars: &lt;, &gt;, &amp;, &apos;, &quot;, &#13;");
assert_eq!(format!("{data}"), "special chars: <, >, &, \', \", \r");
}

#[test]
Expand Down
50 changes: 48 additions & 2 deletions autosar-data/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,46 @@ impl<'a> ArxmlParser<'a> {
} else if rem.starts_with("&quot;") {
unescaped.push('"');
rem = &rem[6..];
} else if rem.starts_with("&#x") {
// hexadecimal character reference
let mut valid = false;
if let Some(endpos) = rem.find(';') {
let hextxt = &rem[3..endpos];
if let Ok(hexval) = u32::from_str_radix(hextxt, 16) {
if let Some(ch) = char::from_u32(hexval) {
unescaped.push(ch);
rem = &rem[endpos+1..];
valid = true;
}
}
}
if !valid {
self.optional_error(ArxmlParserError::InvalidXmlEntity {
input: input.to_owned(),
})?;
unescaped.push('&');
rem = &rem[1..];
}
} else if rem.starts_with("&#") {
// decimal character reference
let mut valid = false;
if let Some(endpos) = rem.find(';') {
let numtxt = &rem[2..endpos];
if let Ok(val) = u32::from_str(numtxt) {
if let Some(ch) = char::from_u32(val) {
unescaped.push(ch);
rem = &rem[endpos+1..];
valid = true;
}
}
}
if !valid {
self.optional_error(ArxmlParserError::InvalidXmlEntity {
input: input.to_owned(),
})?;
unescaped.push('&');
rem = &rem[1..];
}
} else {
self.optional_error(ArxmlParserError::InvalidXmlEntity {
input: input.to_owned(),
Expand Down Expand Up @@ -1131,11 +1171,17 @@ mod test {
fn unescape_entities() {
let mut parser = ArxmlParser::new(PathBuf::from("test_buffer.arxml"), &[], true);
let result = parser
.unescape_string("&amp;&amp;&lt;FOO&gt;&quot;&quot;&apos;end")
.unescape_string("&amp;&amp;&lt;FOO&gt;&quot;&quot;&apos;&#32;&#x20;end")
.unwrap();
assert_eq!(&result, r#"&&<FOO>""'end"#);
assert_eq!(&result, r#"&&<FOO>""' end"#);
let result = parser.unescape_string("&amp;&amp;&gt;FOO&lt;&quot&quot;&apos;end");
assert!(result.is_err());
// numeric character entity does not accept hex values
let result = parser.unescape_string("&#abcde;");
assert!(result.is_err());
// values from 0x110000 to 0x1FFFFF are not valid unicode code points -> 0x110000 = 1114112
let result = parser.unescape_string("&#1114112;");
assert!(result.is_err());
}

const PARSER_TEST_DATA: &str = r#"<?xml version="1.0" encoding="utf-8"?>
Expand Down

0 comments on commit 35241f7

Please sign in to comment.