Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle non-unicode payload in Logstash. #16072

Merged
merged 6 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion logstash-core/lib/logstash/json.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

require "logstash/environment"
require "logstash/util/unicode_normalizer"
require "jrjackson"

module LogStash
Expand All @@ -32,16 +33,35 @@ def jruby_load(data, options = {})
end

# Serializes +o+ to JSON through JrJackson after normalizing string encodings.
#
# o       - the object to serialize; Strings, Arrays and Hashes are normalized
#           recursively by normalize_encoding, anything else is passed through.
# options - options forwarded unchanged to JrJackson::Base.generate.
#
# Returns whatever JrJackson::Base.generate returns for the normalized data.
# Raises LogStash::Json::GeneratorError (carrying the original message) for any
# StandardError raised during normalization or generation.
def jruby_dump(o, options = {})
  # Work on a normalized, frozen copy so the generator never sees strings in
  # a non-UTF-8 or invalid encoding.
  encoding_normalized_data = normalize_encoding(o.dup).freeze

  # TODO [guyboertje] remove these comments in 5.0
  # test for enumerable here to work around an omission in JrJackson::Json.dump to
  # also look for Java::JavaUtil::ArrayList, see TODO submit issue
  # o.is_a?(Enumerable) ? JrJackson::Raw.generate(o) : JrJackson::Json.dump(o)
  #
  # Only the normalized data is serialized: generating from the raw +o+ as well
  # would fail on invalid encodings before the normalized call is reached.
  JrJackson::Base.generate(encoding_normalized_data, options)
rescue => e
  raise LogStash::Json::GeneratorError.new(e.message)
end

# Make the JRuby-backed implementations available under the generic
# load / dump names.
alias_method :load, :jruby_load
alias_method :dump, :jruby_dump

private
# Recursively rebuilds +data+ so that every String it contains has been passed
# through LogStash::UnicodeNormalizer.normalize_string_encoding.
#
# data - a String, Array, Hash, or any other object.
#
# Returns a new Array / Hash / String for those types (hash keys are
# normalized as well as values); any other object is returned unchanged.
def normalize_encoding(data)
  case data
  when Hash
    # to_hash first: the input may be a map-like JRuby object (such as an
    # UnmodifiableMap) rather than a plain Ruby Hash.
    plain_hash = data.to_hash
    # Keys may change when normalized, so they are transformed too.
    rekeyed = plain_hash.transform_keys { |k| normalize_encoding(k) }
    rekeyed.transform_values { |v| normalize_encoding(v) }
  when Array
    data.map { |element| normalize_encoding(element) }
  when String
    LogStash::UnicodeNormalizer.normalize_string_encoding(data)
  else
    # Not a string and not a container of strings: nothing to normalize.
    data
  end
end
end
end
38 changes: 38 additions & 0 deletions logstash-core/lib/logstash/util/unicode_normalizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

module LogStash

  # Helper for coercing String data into valid UTF-8.
  class UnicodeNormalizer

    include LogStash::Util::Loggable

    # Transcodes the given string to UTF-8 and substitutes the Unicode
    # replacement character (U+FFFD) for any invalid or undefined bytes.
    #
    # string_data - The String data to be normalized.
    #
    # Returns a new, normalized String.
    def self.normalize_string_encoding(string_data)
      declared_encoding = string_data.encoding
      # A BINARY-flagged string is treated as if it were UTF-8, so that the
      # cleanup below keeps whatever valid UTF-8 sequences it already holds.
      assumed_encoding = declared_encoding == Encoding::BINARY ? Encoding::UTF_8 : declared_encoding
      transcoded = string_data.encode(Encoding::UTF_8, assumed_encoding, invalid: :replace, undef: :replace)
      transcoded.scrub
    end
  end
end
17 changes: 17 additions & 0 deletions logstash-core/lib/logstash/util/unicode_trimmer.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# encoding: utf-8

module LogStash::Util::UnicodeTrimmer
Expand Down
74 changes: 74 additions & 0 deletions logstash-core/spec/logstash/json_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,78 @@
o = LogStash::Json.load(" ")
expect(o).to be_nil
end

# Exercises LogStash::Json::dump with strings (and containers of strings)
# whose encodings are non-UTF-8, binary-flagged, or contain invalid sequences.
context "Unicode edge-cases" do
# Custom matcher: passes when the actual string is tagged UTF-8 with a valid
# encoding and, when `with_bytes` is chained, its bytes equal the given bytes.
matcher :be_utf8 do
match(:notify_expectation_failures => true) do |actual|
aggregate_failures do
expect(actual).to have_attributes(:encoding => Encoding::UTF_8, :valid_encoding? => true)
expect(actual.bytes).to eq(@expected_bytes) unless @expected_bytes.nil?
end
end
# Optional chain that records the exact byte sequence to compare against.
chain :with_bytes do |expected_bytes|
@expected_bytes = expected_bytes
end
end

# Value under test: the JSON dump of each context's `input`.
let(:result) { LogStash::Json::dump(input) }

# Input is flagged WINDOWS_1252; the expected output is the same characters as UTF-8.
context "with valid non-unicode encoding" do
let(:input) { "Th\xEFs \xCCs W\xCFnd\xD8w\x8A".b.force_encoding(Encoding::WINDOWS_1252).freeze }
it 'transcodes to equivalent UTF-8 code-points' do
aggregate_failures do
expect(result).to be_utf8.with_bytes("\u{22}Th\u{EF}s \u{CC}s W\u{CF}nd\u{D8}w\u{160}\u{22}".bytes)
end
end
end

# Input is flagged UTF-8 but carries two invalid byte sequences.
context "with unicode that has invalid sequences" do
let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze }
it 'replaces each invalid sequence with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
end
end

# Well-formed UTF-8 input: expected output is the input wrapped in double quotes.
context 'with valid unicode' do
let(:input) { "valid \u{A7}\u{a9c5}\u{18a5}\u{1f984} unicode".encode('UTF-8').freeze }
it 'keeps the unicode in-tact' do
expect(result).to be_utf8.with_bytes(('"' + input + '"').bytes)
end
end

# Inputs flagged BINARY.
context 'with binary-flagged input' do

# Only ASCII bytes: the expected output differs from the input only by JSON escaping.
context 'that contains only lower-ascii' do
let(:input) { "hello, world. This is a test including newline(\x0A) literal-backslash(\x5C) double-quote(\x22)".b.force_encoding(Encoding::BINARY).freeze}
it 'does not munge the bytes' do
expect(result).to be_utf8.with_bytes("\x22hello, world. This is a test including newline(\x5Cn) literal-backslash(\x5C\x5C) double-quote(\x5C\x22)\x22".bytes)
end
end

# Same bytes as the invalid-UTF-8 case above, but flagged BINARY; same expected output.
context 'that contains bytes outside lower-ascii' do
let(:input) { "Thïs is a not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::BINARY).freeze }
it 'replaces each invalid sequence with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("\x22Thïs is a not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\x22".bytes)
end
end

end

# Nested hash: the outer key is WINDOWS_1252, the inner key and value are
# flagged UTF-8, and the value contains invalid sequences.
context 'with hash data structure' do
let(:input) {{"Th\xEFs key and".b.force_encoding(Encoding::WINDOWS_1252).freeze =>
{"Thïs key also".b.force_encoding(Encoding::UTF_8).freeze => "not-quite-v\xCEalid uni\xF0\x9D\x84code string 💖ok".b.force_encoding(Encoding::UTF_8).freeze}}}
it 'normalizes and replaces each invalid key-value with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("{\"Th\u{EF}s key and\":{\"Thïs key also\":\"not-quite-v\u{FFFD}alid uni\u{FFFD}code string 💖ok\"}}".bytes)
end
end

# Array mixing a WINDOWS_1252 entry, a valid UTF-8 entry, and a UTF-8 entry
# with invalid sequences.
context 'with array data structure' do
let(:input) {["Th\xEFs entry and".b.force_encoding(Encoding::WINDOWS_1252).freeze,
"Thïs entry also".b.force_encoding(Encoding::UTF_8).freeze,
"not-quite-v\xCEalid uni\xF0\x9D\x84code strings 💖ok".b.force_encoding(Encoding::UTF_8).freeze]}
it 'normalizes and replaces each invalid array values with the xFFFD replacement character' do
expect(result).to be_utf8.with_bytes("[\"Th\u{EF}s entry and\",\"Thïs entry also\",\"not-quite-v\u{FFFD}alid uni\u{FFFD}code strings 💖ok\"]".bytes)
end
end
end
end
2 changes: 1 addition & 1 deletion versions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ jruby:
# Note: this file is copied to the root of logstash-core because its gemspec needs it when
# bundler evaluates the gemspec via bin/logstash
# Ensure Jackson version here is kept in sync with version used by jrjackson gem
jrjackson: 0.4.18
jrjackson: 0.4.20
jackson: 2.16.2
jackson-databind: 2.16.2