elastic · mashhurs · May 16, 2024 · Apr 8, 2024 · Apr 10, 2024 · Apr 10, 2024
diff --git a/logstash-core/lib/logstash/json.rb b/logstash-core/lib/logstash/json.rb
@@ -16,6 +16,7 @@
 # under the License.
 
 require "logstash/environment"
+require "logstash/util/unicode_normalizer"
 require "jrjackson"
 
 module LogStash
@@ -32,16 +33,35 @@ def jruby_load(data, options = {})
     end
 
     def jruby_dump(o, options = {})
+      # no dup/freeze needed: normalize_encoding never mutates `o` and builds new Strings/Arrays/Hashes
+      encoding_normalized_data = normalize_encoding(o)
       # TODO [guyboertje] remove these comments in 5.0
       # test for enumerable here to work around an omission in JrJackson::Json.dump to
       # also look for Java::JavaUtil::ArrayList, see TODO submit issue
       # o.is_a?(Enumerable) ? JrJackson::Raw.generate(o) : JrJackson::Json.dump(o)
-      JrJackson::Base.generate(o, options)
+      JrJackson::Base.generate(encoding_normalized_data, options)
     rescue => e
       raise LogStash::Json::GeneratorError.new(e.message)
     end
 
     alias_method :load, "jruby_load".to_sym
     alias_method :dump, "jruby_dump".to_sym
+
+    private
+    # Walks +data+ and runs every String (Hash keys included) through the normalizer.
+    def normalize_encoding(data)
+      case data
+      when String then LogStash::UnicodeNormalizer.normalize_string_encoding(data)
+      when Array then data.map { |element| normalize_encoding(element) }
+      when Hash
+        # to_hash: the input may come from jruby objects such as UnmodifiableMap
+        plain_hash = data.to_hash
+        # keys are rebuilt as well because normalizing can change them
+        rekeyed = plain_hash.transform_keys { |k| normalize_encoding(k) }
+        rekeyed.transform_values { |v| normalize_encoding(v) }
+      else
+        data # anything else is passed through unchanged
+      end
+    end
   end
 end
diff --git a/logstash-core/lib/logstash/util/unicode_normalizer.rb b/logstash-core/lib/logstash/util/unicode_normalizer.rb
@@ -0,0 +1,38 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module LogStash
+
+  # Normalizes strings whose bytes are not valid, well-formed UTF-8.
+  class UnicodeNormalizer
+    include LogStash::Util::Loggable
+
+    # Transcodes the input string to UTF-8 from its own encoding and
+    # substitutes the replacement character (U+FFFD) for any sequence
+    # that is invalid or cannot be represented.
+    #
+    # string_data - The String data to be normalized.
+    # Returns the normalized string data.
+    def self.normalize_string_encoding(string_data)
+      # A BINARY-flagged string is assumed to hold UTF-8 so that the
+      # cleanup below keeps its valid UTF-8 sequences.
+      declared = string_data.encoding
+      origin = declared == Encoding::BINARY ? Encoding::UTF_8 : declared
+      string_data.encode(Encoding::UTF_8, origin, invalid: :replace, undef: :replace).scrub
+    end
+  end
+end
diff --git a/logstash-core/lib/logstash/util/unicode_trimmer.rb b/logstash-core/lib/logstash/util/unicode_trimmer.rb
@@ -1,3 +1,20 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # encoding: utf-8
 
 module LogStash::Util::UnicodeTrimmer

diff --git a/logstash-core/spec/logstash/json_spec.rb b/logstash-core/spec/logstash/json_spec.rb
@@ -118,4 +118,78 @@
     o = LogStash::Json.load("  ")
     expect(o).to be_nil
   end
+
+  context "Unicode edge-cases" do
+    matcher :be_utf8 do # passes for a UTF-8-tagged, valid string; with_bytes additionally pins the exact bytes
+      match(:notify_expectation_failures => true) do |actual| # let the inner expect failures be reported
+        aggregate_failures do
+          expect(actual).to have_attributes(:encoding => Encoding::UTF_8, :valid_encoding? => true)
+          expect(actual.bytes).to eq(@expected_bytes) unless @expected_bytes.nil?
+        end
+      end
+      chain :with_bytes do |expected_bytes|
+        @expected_bytes = expected_bytes
+      end
+    end
+
+    let(:result) { LogStash::Json::dump(input) } # input is defined by each context below
+
+    context "with valid non-unicode encoding" do # the expectation maps byte \x8A to U+0160
+      let(:input) { "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze }
+      it 'transcodes to equivalent UTF-8 code-points' do
+        aggregate_failures do
+          expect(result).to be_utf8.with_bytes("\u{22}Th\u{EF}s \u{CC}s W\u{CF}nd\u{D8}w\u{160}\u{22}".bytes)
+        end
+      end
+    end
+
+    context "with unicode that has invalid sequences" do # \xCE and \xF0\x9D\x84 are incomplete multi-byte sequences
+      let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze }
+      it 'replaces each invalid sequence with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+      end
+    end
+
+    context 'with valid unicode' do
+      let(:input) { "valid \u{A7}\u{a9c5}\u{18a5}\u{1f984} unicode".encode('UTF-8').freeze }
+      it 'keeps the unicode in-tact' do
+        expect(result).to be_utf8.with_bytes(('"' + input + '"').bytes)
+      end
+    end
+
+    context 'with binary-flagged input' do # expectations below treat BINARY-tagged bytes as UTF-8
+
+      context 'that contains only lower-ascii' do
+        let(:input) { "hello, world. This is a test including newline(\x0A) literal-backslash(\x5C) double-quote(\x22)".b.force_encoding(Encoding::BINARY).freeze}
+        it 'does not munge the bytes' do # apart from the JSON escapes for newline, backslash and double-quote
+          expect(result).to be_utf8.with_bytes("\x22hello, world. This is a test including newline(\x5Cn) literal-backslash(\x5C\x5C) double-quote(\x5C\x22)\x22".bytes)
+        end
+      end
+
+      context 'that contains bytes outside lower-ascii' do
+        let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::BINARY).freeze }
+        it 'replaces each invalid sequence with the xFFFD replacement character' do
+          expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+        end
+      end
+
+    end
+
+    context 'with hash data structure' do # both nesting levels: keys and values are normalized
+      let(:input) {{"Th\xEFs key and".b.force_encoding(Encoding::WINDOWS_1252).freeze =>
+                      {"Thïs key also".b.force_encoding(Encoding::UTF_8).freeze => "not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze}}}
+      it 'normalizes and replaces each invalid key-value with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("{\"Th\u{EF}s key and\":{\"Thïs key also\":\"not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\"}}".bytes)
+      end
+    end
+
+    context 'with array data structure' do # elements carry different source encodings
+      let(:input) {["Th\xEFs entry and".b.force_encoding(Encoding::WINDOWS_1252).freeze,
+                    "Thïs entry also".b.force_encoding(Encoding::UTF_8).freeze,
+                    "not-quite-v\xCEalid uni\xF0\x9D\x84code strings 💖ok".b.force_encoding(Encoding::UTF_8).freeze]}
+      it 'normalizes and replaces each invalid array values with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("[\"Th\u{EF}s entry and\",\"Thïs entry also\",\"not-quite-v\u{FFFD}alid uni\u{FFFD}code strings 💖ok\"]".bytes)
+      end
+    end
+  end
 end
diff --git a/versions.yml b/versions.yml
@@ -24,6 +24,6 @@ jruby:
 # Note: this file is copied to the root of logstash-core because its gemspec needs it when
 #       bundler evaluates the gemspec via bin/logstash
 # Ensure Jackson version here is kept in sync with version used by jrjackson gem
-jrjackson: 0.4.18
+jrjackson: 0.4.20
 jackson: 2.16.2
 jackson-databind: 2.16.2