Skip to content

Commit

Permalink
fix(jruby): serialize entity refs or the replacement text, not both (#…
Browse files Browse the repository at this point in the history
…3272)

**What problem is this PR intended to solve?**

Two things getting fixed here:

1. We were not calling `setFeature(FEATURE_NOT_EXPAND_ENTITY, ...)`
correctly. It defaults to true, and we were conditionally setting it to
true. Instead, let's just explicitly set this feature (and the other
features we care about) to avoid mistaken assumptions about the default.

2. We were rendering the children of the EntityReference, which contains
the replacement text, as well as the EntityReference itself. We should
only ever render one or the other. If NOENT is false, though, there
won't be any EntityReferences in the DOM. So: if we encounter an
EntityReference, don't render its children.

See https://xerces.apache.org/xerces-j/features.html section on
`create-entity-ref-nodes` for a deeper explanation of the parser
behavior.

Closes #3270


**Have you included adequate test coverage?**

Yes


**Does this change affect the behavior of either the C or the Java
implementations?**

This fixes a bug in the Java implementation, so it behaves like the C
impl.
  • Loading branch information
flavorjones authored Jul 3, 2024
2 parents 043e1e9 + aad5c90 commit 0e9cf15
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
* [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones
* [CRuby] libgumbo correctly prints nonstandard element names in error messages. [#3219] @stevecheckoway
* [CRuby] SAX parsing no longer registers errors when encountering external entity references. [#1926] @flavorjones
* [JRuby] Fixed entity reference serialization, which rendered both the reference and the replacement text. Incredibly nobody noticed this bug for over a decade. [#3272] @flavorjones
* [JRuby] Fixed some bugs in how `Node#attributes` handles attributes with namespaces. [#2677, #2679] @flavorjones
* [JRuby] Fix `Schema#validate` to only return the most recent Document's errors. Previously, if multiple documents were validated, this method returned the accumulated errors of all previous documents. [#1282] @flavorjones
* [JRuby] Fix `Schema#validate` to not clobber the `@errors` instance variable. [#1282] @flavorjones
Expand Down
28 changes: 16 additions & 12 deletions ext/java/nokogiri/XmlEntityReference.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,23 @@ public class XmlEntityReference extends XmlNode
public void
accept(ThreadContext context, SaveContextVisitor visitor)
{
//
// Note that when noEnt is set, we call setFeature(FEATURE_NOT_EXPAND_ENTITY, false) in
// XmlDomParserContext.
//
// See https://xerces.apache.org/xerces-j/features.html section on `create-entity-ref-nodes`
//
// When set to true (the default), then EntityReference nodes are present in the DOM tree, and
// its children represent the replacement text. When set to false, then the EntityReference is
// not present in the tree, and instead the replacement text nodes are present.
//
// So: if we are here, then noEnt must be true, and we should just serialize the EntityReference
// and not worry about the replacement text. When noEnt is false, we would never this and
// instead would be serializing the replacement text.
//
// https://github.com/sparklemotion/nokogiri/issues/3270
//
visitor.enter(node);
Node child = node.getFirstChild();
while (child != null) {
IRubyObject nokoNode = getCachedNodeOrCreate(context.getRuntime(), child);
if (nokoNode instanceof XmlNode) {
XmlNode cur = (XmlNode) nokoNode;
cur.accept(context, visitor);
} else if (nokoNode instanceof XmlNamespace) {
XmlNamespace cur = (XmlNamespace) nokoNode;
cur.accept(context, visitor);
}
child = child.getNextSibling();
}
visitor.leave(node);
}
}
25 changes: 7 additions & 18 deletions ext/java/nokogiri/internals/XmlDomParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,28 +91,17 @@ public class XmlDomParserContext extends ParserContext
// Fix for Issue#586. This limits entity expansion up to 100000 and nodes up to 3000.
setProperty(SECURITY_MANAGER, new org.apache.xerces.util.SecurityManager());

if (options.noBlanks) {
setFeature(FEATURE_INCLUDE_IGNORABLE_WHITESPACE, false);
}

if (options.recover) {
setFeature(CONTINUE_AFTER_FATAL_ERROR, true);
}
setFeature(FEATURE_INCLUDE_IGNORABLE_WHITESPACE, !options.noBlanks);
setFeature(CONTINUE_AFTER_FATAL_ERROR, options.recover);
setFeature(FEATURE_VALIDATION, options.dtdValid);
setFeature(FEATURE_NOT_EXPAND_ENTITY, !options.noEnt);

if (options.dtdValid) {
setFeature(FEATURE_VALIDATION, true);
}

if (!options.noEnt) {
setFeature(FEATURE_NOT_EXPAND_ENTITY, true);
}
// If we turn off loading of external DTDs complete, we don't
// get the publicID. Instead of turning off completely, we use
// an entity resolver that returns empty documents.
if (options.dtdLoad) {
setFeature(FEATURE_LOAD_EXTERNAL_DTD, true);
setFeature(FEATURE_LOAD_DTD_GRAMMAR, true);
}
setFeature(FEATURE_LOAD_EXTERNAL_DTD, options.dtdLoad);
setFeature(FEATURE_LOAD_DTD_GRAMMAR, options.dtdLoad);

parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options));
}

Expand Down
72 changes: 72 additions & 0 deletions test/xml/test_entity_reference.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,78 @@ def test_children_should_always_be_empty

entity.inspect # should not segfault
end

def test_serialization_of_local_entities_without_noent
xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE test [ <!ENTITY quux "expansion"> ]>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml)
assert_equal("<test>&quux;</test>", doc.root.to_xml)
end

def test_serialization_of_local_entities_with_noent
xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE test [ <!ENTITY quux "expansion"> ]>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml) { |cfg| cfg.noent }
assert_equal("<test>expansion</test>", doc.root.to_xml)
end

def test_serialization_of_undeclared_entities_without_noent
if Nokogiri.uses_libxml?("< 2.13.0") # gnome/libxml2@45fe9924
skip("libxml2 version under test is inconsistent in handling undeclared entities")
end

xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml)
assert_equal("<test>&quux;</test>", doc.root.to_xml)
end

def test_serialization_of_undeclared_entities_with_noent
xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml) { |cfg| cfg.noent }
assert_equal("<test/>", doc.root.to_xml)
end

def test_serialization_of_unresolved_entities_without_noent
xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE test [
<!ENTITY quux SYSTEM "http://0.0.0.0:8080/not-resolved.dtd">
]>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml)
assert_equal("<test>&quux;</test>", doc.root.to_xml)
end

def test_serialization_of_unresolved_entities_with_noent
xml = <<~XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE test [
<!ENTITY quux SYSTEM "http://0.0.0.0:8080/not-resolved.dtd">
]>
<test>&quux;</test>
XML

doc = Nokogiri::XML(xml) { |cfg| cfg.noent }
assert_equal("<test/>", doc.root.to_xml)
end
end

module Common
Expand Down

0 comments on commit 0e9cf15

Please sign in to comment.