sparklemotion · flavorjones · Aug 5, 2024 · Aug 1, 2024 · Aug 5, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -73,6 +73,7 @@ We've resolved many long-standing bugs in the various schema classes, validation
 * CSS queries for pseudo-selectors that cannot be translated into XPath expressions now raise a more descriptive `Nokogiri::CSS::SyntaxError` when they are parsed. Previously, an invalid XPath expression was evaluated and a hard-to-understand XPath error was raised by the query engine. [#3193] @flavorjones
 * `Schema#validate` returns errors on empty and malformed files. Previously, it would return errors on empty/malformed Documents, but not when reading from files. [#642] @flavorjones
 * `XML::Builder` is now consistent with how it sets block scope. Previously, missing methods with blocks on dynamically-created nodes were always handled by invoking `instance_eval(&block)` on the Builder, even when the Builder was yielding self for all other missing methods with blocks. [#1041] @flavorjones
+* `HTML4::DocumentFragment.parse` accepts `IO` input. Previously, it required a string and would raise a `TypeError` when passed an `IO`. [#2069] @sharvy
 * [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway
 * [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones
 * [CRuby] libgumbo correctly prints nonstandard element names in error messages. [#3219] @stevecheckoway

diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb
@@ -3,11 +3,72 @@
 module Nokogiri
   module HTML4
     class DocumentFragment < Nokogiri::XML::DocumentFragment
-      ####
-      # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
+      #
+      # :call-seq:
+      #   parse(tags) => DocumentFragment
+      #   parse(tags, encoding) => DocumentFragment
+      #   parse(tags, encoding, options) => DocumentFragment
+      #   parse(tags, encoding) { |options| ... } => DocumentFragment
+      #
+      # Parse an HTML4 fragment.
+      #
+      # [Parameters]
+      # - +tags+ (optional String, or any object that responds to +#read+ such as an IO, or
+      #   StringIO)
+      # - +encoding+ (optional String) the name of the encoding that should be used when processing
+      #   the document.  (default +nil+ for auto-detection)
+      # - +options+ (optional) configuration object that sets options during parsing, such as
+      #   Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more
+      #   information.
+      #
+      # [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify
+      #   before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information.
+      #
+      # [Returns] DocumentFragment
+      #
+      # *Example:* Parsing a string
+      #
+      #   fragment = DocumentFragment.parse("<div>Hello World</div>")
+      #
+      # *Example:* Parsing an IO
+      #
+      #   fragment = File.open("fragment.html") do |file|
+      #     DocumentFragment.parse(file)
+      #   end
+      #
+      # *Example:* Specifying encoding
+      #
+      #   fragment = DocumentFragment.parse(input, "EUC-JP")
+      #
+      # *Example:* Setting parse options dynamically
+      #
+      #   DocumentFragment.parse("<div>Hello World") do |options|
+      #     options.huge.pedantic
+      #   end
+      #
       def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
         doc = HTML4::Document.new
 
+        if tags.respond_to?(:read)
+          # Handle IO-like objects (IO, File, StringIO, etc.)
+          # The _read_ method of these objects doesn't accept an +encoding+ parameter.
+          # Encoding is usually set when the IO object is created or opened,
+          # or by using the _set_encoding_ method.
+          #
+          # 1. If +encoding+ is provided and the object supports _set_encoding_,
+          #    set the encoding before reading.
+          # 2. Read the content from the IO-like object.
+          #
+          # Note: After reading, the content's encoding will be:
+          # - The encoding set by _set_encoding_ if it was called
+          # - The default encoding of the IO object otherwise
+          #
+          # For StringIO specifically, _set_encoding_ affects only the internal string,
+          # not how the data is read out.
+          tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding)
+          tags = tags.read
+        end
+
         encoding ||= if tags.respond_to?(:encoding)
           encoding = tags.encoding
           if encoding == ::Encoding::ASCII_8BIT
@@ -24,6 +85,8 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML,
         new(doc, tags, nil, options, &block)
       end
 
+      # It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this
+      # method directly.
       def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
         return self unless tags
 

diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb
@@ -270,6 +270,15 @@ def test_dup_should_create_an_html_document_fragment
           assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate)
         end
 
+        def test_parse_with_io
+          fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("<div>hello</div>"), "UTF-8")
+          assert_instance_of(HTML4::DocumentFragment, fragment)
+          assert_equal("<div>hello</div>", fragment.to_s)
+
+          fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("<div>hello</div>"))
+          assert_equal("<div>hello</div>", fragment.to_s)
+        end
+
         describe "encoding" do
           describe "#fragment" do
             it "parses an encoded string" do