Skip to content

Commit

Permalink
Handle non-unicode payload in Logstash. (#16072) (#16168)
Browse files Browse the repository at this point in the history
* A logic to handle non-unicode payload in Logstash.

Co-authored-by: Ry Biesemeyer <[email protected]>

* Upgrade jrjackson to 0.4.20

* Code review: simplify the logic with a standard String#encode interface with replace option.

Co-authored-by: Ry Biesemeyer <[email protected]>

---------

Co-authored-by: Ry Biesemeyer <[email protected]>
Co-authored-by: Ry Biesemeyer <[email protected]>
(cherry picked from commit 979d30d)
  • Loading branch information
mashhurs committed May 21, 2024
1 parent 304b8b2 commit 7277a36
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 2 deletions.
22 changes: 21 additions & 1 deletion logstash-core/lib/logstash/json.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

require "logstash/environment"
require "logstash/util/unicode_normalizer"
require "jrjackson"

module LogStash
Expand All @@ -32,16 +33,35 @@ def jruby_load(data, options = {})
end

# Serializes +o+ to JSON through JrJackson after normalizing string encodings.
#
# Strings reachable through nested Arrays and Hashes (keys and values) are
# passed through normalize_encoding before generation, so only the normalized
# copy is ever handed to the generator.
#
# o       - The object to serialize.
# options - Options passed through unchanged to JrJackson::Base.generate.
#
# Returns whatever JrJackson::Base.generate returns for the normalized data.
# Raises LogStash::Json::GeneratorError, carrying the original message, for any
# StandardError raised while normalizing or generating.
def jruby_dump(o, options = {})
  # Work on a copy: normalize_encoding hands non-String/Array/Hash input back
  # untouched, so presumably the dup keeps the freeze below off the caller's object.
  encoding_normalized_data = normalize_encoding(o.dup).freeze

  # TODO [guyboertje] remove these comments in 5.0
  # test for enumerable here to work around an omission in JrJackson::Json.dump to
  # also look for Java::JavaUtil::ArrayList, see TODO submit issue
  # o.is_a?(Enumerable) ? JrJackson::Raw.generate(o) : JrJackson::Json.dump(o)
  JrJackson::Base.generate(encoding_normalized_data, options)
rescue => e
  raise LogStash::Json::GeneratorError.new(e.message)
end

alias_method :load, "jruby_load".to_sym
alias_method :dump, "jruby_dump".to_sym

private
# Recursively normalizes the encoding of every String found in +data+.
#
# Strings are delegated to LogStash::UnicodeNormalizer.normalize_string_encoding;
# Arrays and Hashes are rebuilt with each element, key and value normalized.
# Anything else is returned as-is.
#
# data - The value to normalize.
#
# Returns the normalized value.
def normalize_encoding(data)
  case data
  when String then LogStash::UnicodeNormalizer.normalize_string_encoding(data)
  when Array  then data.map { |element| normalize_encoding(element) }
  when Hash
    # to_hash first, per the original note: input may come from jruby objects
    # such as UnmodifiableMap.
    plain_hash = data.to_hash
    # Keys are re-mapped as well because normalizing may change a key's content.
    rekeyed = plain_hash.transform_keys { |k| normalize_encoding(k) }
    rekeyed.transform_values { |v| normalize_encoding(v) }
  else
    data
  end
end
end
end
38 changes: 38 additions & 0 deletions logstash-core/lib/logstash/util/unicode_normalizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

module LogStash

  # Utility that coerces string data into valid UTF-8.
  class UnicodeNormalizer

    include LogStash::Util::Loggable

    # Transcodes a string to UTF-8 from its declared encoding, substituting
    # the replacement character (U+FFFD) for invalid byte sequences and for
    # characters that cannot be represented.
    #
    # string_data - The String data to be normalized.
    #
    # Returns a UTF-8 String.
    def self.normalize_string_encoding(string_data)
      declared = string_data.encoding
      # A BINARY-flagged string carries no real encoding information; treat it
      # as UTF-8 so that any valid UTF-8 sequences it holds are retained.
      assumed = declared == Encoding::BINARY ? Encoding::UTF_8 : declared
      transcoded = string_data.encode(Encoding::UTF_8, assumed, invalid: :replace, undef: :replace)
      # Final pass to replace any invalid bytes still present after transcoding.
      transcoded.scrub
    end
  end
end
17 changes: 17 additions & 0 deletions logstash-core/lib/logstash/util/unicode_trimmer.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# encoding: utf-8

module LogStash::Util::UnicodeTrimmer
Expand Down
74 changes: 74 additions & 0 deletions logstash-core/spec/logstash/json_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,78 @@
o = LogStash::Json.load(" ")
expect(o).to be_nil
end

# Exercises LogStash::Json.dump with inputs whose declared encoding or byte
# content is not clean UTF-8, and checks the exact bytes of the generated JSON.
context "Unicode edge-cases" do
# Custom matcher: passes when the actual string is tagged UTF-8 and has a valid
# encoding; when chained with `with_bytes`, its bytes must also equal the
# expected byte array.
matcher :be_utf8 do
match(:notify_expectation_failures => true) do |actual|
aggregate_failures do
expect(actual).to have_attributes(:encoding => Encoding::UTF_8, :valid_encoding? => true)
# byte comparison only applies when `with_bytes` was chained
expect(actual.bytes).to eq(@expected_bytes) unless @expected_bytes.nil?
end
end
chain :with_bytes do |expected_bytes|
@expected_bytes = expected_bytes
end
end

# JSON produced for the `input` defined by each nested context
let(:result) { LogStash::Json::dump(input) }

context "with valid non-unicode encoding" do
# bytes are tagged as Windows-1252; the expectation maps each to its Unicode
# code point (e.g. \x8A => U+0160), wrapped in double quotes (U+0022)
let(:input) { "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze }
it 'transcodes to equivalent UTF-8 code-points' do
aggregate_failures do
expect(result).to be_utf8.with_bytes("\u{22}Th\u{EF}s \u{CC}s W\u{CF}nd\u{D8}w\u{160}\u{22}".bytes)
end
end
end

context "with unicode that has invalid sequences" do
# tagged UTF-8 but contains a stray \xCE and an incomplete \xF0\x9D\x84
# sequence; each is expected to collapse to a single U+FFFD
let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze }
it 'replaces each invalid sequence with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
end
end

context 'with valid unicode' do
# already-valid UTF-8 must come back unchanged apart from the surrounding quotes
let(:input) { "valid \u{A7}\u{a9c5}\u{18a5}\u{1f984} unicode".encode('UTF-8').freeze }
it 'keeps the unicode in-tact' do
expect(result).to be_utf8.with_bytes(('"' + input + '"').bytes)
end
end

# inputs tagged Encoding::BINARY
context 'with binary-flagged input' do

context 'that contains only lower-ascii' do
# expected output shows the newline, backslash and double-quote escaped in the
# JSON, with all other bytes unchanged
let(:input) { "hello, world. This is a test including newline(\x0A) literal-backslash(\x5C) double-quote(\x22)".b.force_encoding(Encoding::BINARY).freeze}
it 'does not munge the bytes' do
expect(result).to be_utf8.with_bytes("\x22hello, world. This is a test including newline(\x5Cn) literal-backslash(\x5C\x5C) double-quote(\x5C\x22)\x22".bytes)
end
end

context 'that contains bytes outside lower-ascii' do
# same payload as the invalid-UTF-8 case above, but tagged BINARY; the expected
# output is identical (valid sequences kept, invalid ones => U+FFFD)
let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::BINARY).freeze }
it 'replaces each invalid sequence with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
end
end

end

context 'with hash data structure' do
# nested hash: a Windows-1252 outer key, a UTF-8 inner key, and an inner value
# containing invalid UTF-8 sequences
let(:input) {{"Th\xEFs key and".b.force_encoding(Encoding::WINDOWS_1252).freeze =>
{"Thïs key also".b.force_encoding(Encoding::UTF_8).freeze => "not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze}}}
it 'normalizes and replaces each invalid key-value with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("{\"Th\u{EF}s key and\":{\"Thïs key also\":\"not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\"}}".bytes)
end
end

context 'with array data structure' do
# array mixing a Windows-1252 entry, a valid UTF-8 entry and an entry with
# invalid UTF-8 sequences
let(:input) {["Th\xEFs entry and".b.force_encoding(Encoding::WINDOWS_1252).freeze,
"Thïs entry also".b.force_encoding(Encoding::UTF_8).freeze,
"not-quite-v\xCEalid uni\xF0\x9D\x84code strings 💖ok".b.force_encoding(Encoding::UTF_8).freeze]}
it 'normalizes and replaces each invalid array values with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("[\"Th\u{EF}s entry and\",\"Thïs entry also\",\"not-quite-v\u{FFFD}alid uni\u{FFFD}code strings 💖ok\"]".bytes)
end
end
end
end
2 changes: 1 addition & 1 deletion versions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ jruby:
# Note: this file is copied to the root of logstash-core because its gemspec needs it when
# bundler evaluates the gemspec via bin/logstash
# Ensure Jackson version here is kept in sync with version used by jrjackson gem
jrjackson: 0.4.18
jrjackson: 0.4.20
jackson: 2.15.3
jackson-databind: 2.15.3

0 comments on commit 7277a36

Please sign in to comment.