Handle non-unicode payload in Logstash. (#16072) (#16168)

* Add logic to handle non-Unicode payloads in Logstash. Co-authored-by: Ry Biesemeyer <[email protected]>
* Upgrade jrjackson to 0.4.20.
* Code review: simplify the logic by using the standard String#encode interface with the replace option. Co-authored-by: Ry Biesemeyer <[email protected]>

---------

Co-authored-by: Ry Biesemeyer <[email protected]>
Co-authored-by: Ry Biesemeyer <[email protected]>
(cherry picked from commit 979d30d)
elastic · May 21, 2024 · 7277a36 · 7277a36
1 parent 304b8b2
commit 7277a36
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 2 deletions.
diff --git a/logstash-core/lib/logstash/json.rb b/logstash-core/lib/logstash/json.rb
@@ -16,6 +16,7 @@
 # under the License.
 
 require "logstash/environment"
+require "logstash/util/unicode_normalizer"
 require "jrjackson"
 
 module LogStash
@@ -32,16 +33,35 @@ def jruby_load(data, options = {})
  end
 
  def jruby_dump(o, options = {})
+ encoding_normalized_data = normalize_encoding(o.dup).freeze
+
  # TODO [guyboertje] remove these comments in 5.0
  # test for enumerable here to work around an omission in JrJackson::Json.dump to
  # also look for Java::JavaUtil::ArrayList, see TODO submit issue
  # o.is_a?(Enumerable) ? JrJackson::Raw.generate(o) : JrJackson::Json.dump(o)
- JrJackson::Base.generate(o, options)
+ JrJackson::Base.generate(encoding_normalized_data, options)
  rescue => e
  raise LogStash::Json::GeneratorError.new(e.message)
  end
 
  alias_method :load, "jruby_load".to_sym
  alias_method :dump, "jruby_dump".to_sym
+
+ private
+ def normalize_encoding(data)
+ case data
+ when String
+ LogStash::UnicodeNormalizer.normalize_string_encoding(data)
+ when Array
+ data.map { |item| normalize_encoding(item) }
+ when Hash
+ # origin key might change when normalizing, so requires transformation
+ data.to_hash # if coming from jruby objects such as UnmodifiableMap
+ .transform_keys { |key| normalize_encoding(key) }
+ .transform_values { |value| normalize_encoding(value) }
+ else
+ data # use as it is
+ end
+ end
  end
 end
diff --git a/logstash-core/lib/logstash/util/unicode_normalizer.rb b/logstash-core/lib/logstash/util/unicode_normalizer.rb
@@ -0,0 +1,38 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
module LogStash

  # Normalizes String data into valid UTF-8.
  class UnicodeNormalizer

    include LogStash::Util::Loggable

    # Produces a valid UTF-8 rendering of the given String.
    #
    # * A String that is already flagged UTF-8 and holds a valid byte sequence
    #   needs no work and is returned as-is (the same object, not a copy).
    # * Any other String is transcoded to UTF-8 from its own encoding; invalid
    #   or unconvertible bytes are replaced with the Unicode replacement
    #   character (U+FFFD).
    # * A BINARY-flagged String is treated as if it were UTF-8.
    #
    # string_data - The String data to be normalized.
    #
    # Returns the normalized string data, flagged as UTF-8.
    def self.normalize_string_encoding(string_data)
      source_encoding = string_data.encoding

      # Nothing to repair or transcode: skip the two string copies that
      # encode + scrub would otherwise make for every well-formed string.
      return string_data if source_encoding == Encoding::UTF_8 && string_data.valid_encoding?

      # when given BINARY-flagged string, assume it is UTF-8 so that
      # subsequent cleanup retains valid UTF-8 sequences
      source_encoding = Encoding::UTF_8 if source_encoding == Encoding::BINARY
      string_data.encode(Encoding::UTF_8, source_encoding, invalid: :replace, undef: :replace).scrub
    end
  end
end
diff --git a/logstash-core/lib/logstash/util/unicode_trimmer.rb b/logstash-core/lib/logstash/util/unicode_trimmer.rb
@@ -1,3 +1,20 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # encoding: utf-8
 
 module LogStash::Util::UnicodeTrimmer

diff --git a/logstash-core/spec/logstash/json_spec.rb b/logstash-core/spec/logstash/json_spec.rb
@@ -118,4 +118,78 @@
  o = LogStash::Json.load(" ")
  expect(o).to be_nil
  end
+
+ context "Unicode edge-cases" do
+ matcher :be_utf8 do
+ match(:notify_expectation_failures => true) do |actual|
+ aggregate_failures do
+ expect(actual).to have_attributes(:encoding => Encoding::UTF_8, :valid_encoding? => true)
+ expect(actual.bytes).to eq(@expected_bytes) unless @expected_bytes.nil?
+ end
+ end
+ chain :with_bytes do |expected_bytes|
+ @expected_bytes = expected_bytes
+ end
+ end
+
+ let(:result) { LogStash::Json::dump(input) }
+
+ context "with valid non-unicode encoding" do
+ let(:input) { "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze }
+ it 'transcodes to equivalent UTF-8 code-points' do
+ aggregate_failures do
+ expect(result).to be_utf8.with_bytes("\u{22}Th\u{EF}s \u{CC}s W\u{CF}nd\u{D8}w\u{160}\u{22}".bytes)
+ end
+ end
+ end
+
+ context "with unicode that has invalid sequences" do
+ let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze }
+ it 'replaces each invalid sequence with the xFFFD replacement character' do
+ expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+ end
+ end
+
+ context 'with valid unicode' do
+ let(:input) { "valid \u{A7}\u{a9c5}\u{18a5}\u{1f984} unicode".encode('UTF-8').freeze }
+ it 'keeps the unicode in-tact' do
+ expect(result).to be_utf8.with_bytes(('"' + input + '"').bytes)
+ end
+ end
+
+ context 'with binary-flagged input' do
+
+ context 'that contains only lower-ascii' do
+ let(:input) { "hello, world. This is a test including newline(\x0A) literal-backslash(\x5C) double-quote(\x22)".b.force_encoding(Encoding::BINARY).freeze}
+ it 'does not munge the bytes' do
+ expect(result).to be_utf8.with_bytes("\x22hello, world. This is a test including newline(\x5Cn) literal-backslash(\x5C\x5C) double-quote(\x5C\x22)\x22".bytes)
+ end
+ end
+
+ context 'that contains bytes outside lower-ascii' do
+ let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::BINARY).freeze }
+ it 'replaces each invalid sequence with the xFFFD replacement character' do
+ expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+ end
+ end
+
+ end
+
+ context 'with hash data structure' do
+ let(:input) {{"Th\xEFs key and".b.force_encoding(Encoding::WINDOWS_1252).freeze =>
+ {"Thïs key also".b.force_encoding(Encoding::UTF_8).freeze => "not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze}}}
+ it 'normalizes and replaces each invalid key-value with the xFFFD replacement character' do
+ expect(result).to be_utf8.with_bytes("{\"Th\u{EF}s key and\":{\"Thïs key also\":\"not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\"}}".bytes)
+ end
+ end
+
+ context 'with array data structure' do
+ let(:input) {["Th\xEFs entry and".b.force_encoding(Encoding::WINDOWS_1252).freeze,
+ "Thïs entry also".b.force_encoding(Encoding::UTF_8).freeze,
+ "not-quite-v\xCEalid uni\xF0\x9D\x84code strings 💖ok".b.force_encoding(Encoding::UTF_8).freeze]}
+ it 'normalizes and replaces each invalid array values with the xFFFD replacement character' do
+ expect(result).to be_utf8.with_bytes("[\"Th\u{EF}s entry and\",\"Thïs entry also\",\"not-quite-v\u{FFFD}alid uni\u{FFFD}code strings 💖ok\"]".bytes)
+ end
+ end
+ end
 end
diff --git a/versions.yml b/versions.yml
@@ -24,6 +24,6 @@ jruby:
 # Note: this file is copied to the root of logstash-core because its gemspec needs it when
 # bundler evaluates the gemspec via bin/logstash
 # Ensure Jackson version here is kept in sync with version used by jrjackson gem
-jrjackson: 0.4.18
+jrjackson: 0.4.20
 jackson: 2.15.3
 jackson-databind: 2.15.3