diff --git a/logstash-core/lib/logstash/json.rb b/logstash-core/lib/logstash/json.rb
index 905c9cc1280..6ffe70e539e 100644
--- a/logstash-core/lib/logstash/json.rb
+++ b/logstash-core/lib/logstash/json.rb
@@ -16,6 +16,7 @@
 # under the License.
 
 require "logstash/environment"
+require "logstash/util/unicode_normalizer"
 require "jrjackson"
 
 module LogStash
@@ -32,16 +33,39 @@ def jruby_load(data, options = {})
     end
 
     def jruby_dump(o, options = {})
+      # normalize_encoding only reads `o` and builds fresh strings/containers,
+      # so a defensive dup (and freeze of the result) would just be a wasted shallow copy
+      encoding_normalized_data = normalize_encoding(o)
+
       # TODO [guyboertje] remove these comments in 5.0
       # test for enumerable here to work around an omission in JrJackson::Json.dump to
       # also look for Java::JavaUtil::ArrayList, see TODO submit issue
       # o.is_a?(Enumerable) ? JrJackson::Raw.generate(o) : JrJackson::Json.dump(o)
-      JrJackson::Base.generate(o, options)
+      JrJackson::Base.generate(encoding_normalized_data, options)
     rescue => e
       raise LogStash::Json::GeneratorError.new(e.message)
     end
 
     alias_method :load, "jruby_load".to_sym
     alias_method :dump, "jruby_dump".to_sym
+
+    private
+    # Recursively returns a copy of `data` in which every String (Hash keys included)
+    # is normalized to UTF-8; Arrays and Hashes are rebuilt, any other object is returned as-is.
+    def normalize_encoding(data)
+      case data
+      when String
+        LogStash::UnicodeNormalizer.normalize_string_encoding(data)
+      when Array
+        data.map { |item| normalize_encoding(item) }
+      when Hash
+        # origin key might change when normalizing, so requires transformation
+        data.to_hash # if coming from jruby objects such as UnmodifiableMap
+          .transform_keys { |key| normalize_encoding(key) }
+          .transform_values { |value| normalize_encoding(value) }
+      else
+        data # use as it is
+      end
+    end
   end
 end
diff --git a/logstash-core/lib/logstash/util/unicode_normalizer.rb b/logstash-core/lib/logstash/util/unicode_normalizer.rb
new file mode 100644
index 00000000000..7397a3eb34d
--- /dev/null
+++ b/logstash-core/lib/logstash/util/unicode_normalizer.rb
@@ -0,0 +1,38 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module LogStash
+
+  # A class to normalize the invalid unicode data
+  class UnicodeNormalizer
+
+    include LogStash::Util::Loggable
+
+    # Tries to normalize input string to UTF-8 when
+    # input string encoding is not UTF-8,
+    # and replaces invalid unicode bytes with replacement characters ('uFFFD')
+    # string_data - The String data to be normalized.
+    # Returns the normalized string data.
+    def self.normalize_string_encoding(string_data)
+      # when given BINARY-flagged string, assume it is UTF-8 so that
+      # subsequent cleanup retains valid UTF-8 sequences
+      source_encoding = string_data.encoding
+      source_encoding = Encoding::UTF_8 if source_encoding == Encoding::BINARY
+      string_data.encode(Encoding::UTF_8, source_encoding, invalid: :replace, undef: :replace).scrub
+    end
+  end
+end
diff --git a/logstash-core/lib/logstash/util/unicode_trimmer.rb b/logstash-core/lib/logstash/util/unicode_trimmer.rb
index ae973d1ff10..68468e67ea7 100644
--- a/logstash-core/lib/logstash/util/unicode_trimmer.rb
+++ b/logstash-core/lib/logstash/util/unicode_trimmer.rb
@@ -1,3 +1,20 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 # encoding: utf-8
 
 module LogStash::Util::UnicodeTrimmer
diff --git a/logstash-core/spec/logstash/json_spec.rb b/logstash-core/spec/logstash/json_spec.rb
index 8cd4ffb04ea..a53e8375f96 100644
--- a/logstash-core/spec/logstash/json_spec.rb
+++ b/logstash-core/spec/logstash/json_spec.rb
@@ -118,4 +118,78 @@
     o = LogStash::Json.load(" ")
     expect(o).to be_nil
   end
+
+  context "Unicode edge-cases" do
+    matcher :be_utf8 do
+      match(:notify_expectation_failures => true) do |actual|
+        aggregate_failures do
+          expect(actual).to have_attributes(:encoding => Encoding::UTF_8, :valid_encoding? => true)
+          expect(actual.bytes).to eq(@expected_bytes) unless @expected_bytes.nil?
+        end
+      end
+      chain :with_bytes do |expected_bytes|
+        @expected_bytes = expected_bytes
+      end
+    end
+
+    let(:result) { LogStash::Json::dump(input) }
+
+    context "with valid non-unicode encoding" do
+      let(:input) { "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze }
+      it 'transcodes to equivalent UTF-8 code-points' do
+        aggregate_failures do
+          expect(result).to be_utf8.with_bytes("\u{22}Th\u{EF}s \u{CC}s W\u{CF}nd\u{D8}w\u{160}\u{22}".bytes)
+        end
+      end
+    end
+
+    context "with unicode that has invalid sequences" do
+      let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze }
+      it 'replaces each invalid sequence with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+      end
+    end
+
+    context 'with valid unicode' do
+      let(:input) { "valid \u{A7}\u{a9c5}\u{18a5}\u{1f984} unicode".encode('UTF-8').freeze }
+      it 'keeps the unicode in-tact' do
+        expect(result).to be_utf8.with_bytes(('"' + input + '"').bytes)
+      end
+    end
+
+    context 'with binary-flagged input' do
+
+      context 'that contains only lower-ascii' do
+        let(:input) { "hello, world. This is a test including newline(\x0A) literal-backslash(\x5C) double-quote(\x22)".b.force_encoding(Encoding::BINARY).freeze}
+        it 'does not munge the bytes' do
+          expect(result).to be_utf8.with_bytes("\x22hello, world. This is a test including newline(\x5Cn) literal-backslash(\x5C\x5C) double-quote(\x5C\x22)\x22".bytes)
+        end
+      end
+
+      context 'that contains bytes outside lower-ascii' do
+        let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::BINARY).freeze }
+        it 'replaces each invalid sequence with the xFFFD replacement character' do
+          expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
+        end
+      end
+
+    end
+
+    context 'with hash data structure' do
+      let(:input) {{"Th\xEFs key and".b.force_encoding(Encoding::WINDOWS_1252).freeze =>
+                    {"Thïs key also".b.force_encoding(Encoding::UTF_8).freeze => "not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze}}}
+      it 'normalizes and replaces each invalid key-value with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("{\"Th\u{EF}s key and\":{\"Thïs key also\":\"not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\"}}".bytes)
+      end
+    end
+
+    context 'with array data structure' do
+      let(:input) {["Th\xEFs entry and".b.force_encoding(Encoding::WINDOWS_1252).freeze,
+                    "Thïs entry also".b.force_encoding(Encoding::UTF_8).freeze,
+                    "not-quite-v\xCEalid uni\xF0\x9D\x84code strings 💖ok".b.force_encoding(Encoding::UTF_8).freeze]}
+      it 'normalizes and replaces each invalid array values with the xFFFD replacement character' do
+        expect(result).to be_utf8.with_bytes("[\"Th\u{EF}s entry and\",\"Thïs entry also\",\"not-quite-v\u{FFFD}alid uni\u{FFFD}code strings 💖ok\"]".bytes)
+      end
+    end
+  end
 end
diff --git a/versions.yml b/versions.yml
index 30fb6545a99..e20b233c5b1 100644
--- a/versions.yml
+++ b/versions.yml
@@ -24,6 +24,6 @@ jruby:
 # Note: this file is copied to the root of logstash-core because its gemspec needs it when
 # bundler evaluates the gemspec via bin/logstash
 # Ensure Jackson version here is kept in sync with version used by jrjackson gem
-jrjackson: 0.4.18
+jrjackson: 0.4.20
 jackson: 2.16.2
 jackson-databind: 2.16.2