From 8e41b90c39c5eb42773eac75fa0757cd3f15706a Mon Sep 17 00:00:00 2001 From: Nick Muerdter <12112+GUI@users.noreply.github.com> Date: Fri, 3 May 2024 23:33:53 -0600 Subject: [PATCH] More complete solution for handling invalid UTF-8 in any part of log In various benchmarks, lua-utf8 applied once at the end seems to be the most performant solution for handling this. --- .../proxy/hooks/init_preload_modules.lua | 1 + .../proxy/hooks/log_initial_proxy.lua | 21 +++-- src/api-umbrella/proxy/log_utils.lua | 26 ++----- test/proxy/logging/test_special_chars.rb | 77 +++++++++++-------- test/support/log_tail.rb | 4 +- 5 files changed, 72 insertions(+), 57 deletions(-) diff --git a/src/api-umbrella/proxy/hooks/init_preload_modules.lua b/src/api-umbrella/proxy/hooks/init_preload_modules.lua index 1c8cbbcc4..822223d7b 100644 --- a/src/api-umbrella/proxy/hooks/init_preload_modules.lua +++ b/src/api-umbrella/proxy/hooks/init_preload_modules.lua @@ -71,6 +71,7 @@ require "api-umbrella.utils.xpcall_error_handler" require "cjson" require "etlua" require "libcidr-ffi" +require "lua-utf8" require "lustache" require "ngx.re" require "pl.path" diff --git a/src/api-umbrella/proxy/hooks/log_initial_proxy.lua b/src/api-umbrella/proxy/hooks/log_initial_proxy.lua index 575de6573..afb053c81 100644 --- a/src/api-umbrella/proxy/hooks/log_initial_proxy.lua +++ b/src/api-umbrella/proxy/hooks/log_initial_proxy.lua @@ -1,4 +1,5 @@ local log_utils = require "api-umbrella.proxy.log_utils" +local utf8_clean = require("lua-utf8").clean local ignore_request = log_utils.ignore_request local ngx_ctx = ngx.ctx @@ -99,11 +100,21 @@ end local function log_request() -- Build the log message and send to Fluent Bit for processing. local data = build_log_data() - local message = json_encode(normalized_data(data)) - local _, err = send_message(message) - if err then - ngx.log(ngx.ERR, "failed to log message: ", err) - return + local original_json = json_encode(normalized_data(data)) + local message, was_valid_utf8 = utf8_clean(original_json) + if not was_valid_utf8 then + ngx.log(ngx.WARN, "log message contained invalid utf-8, original: ", original_json) + ngx.log(ngx.WARN, "log message contained invalid utf-8, cleaned: ", message) + elseif not message then + ngx.log(ngx.ERR, "failed to log message, message missing: ", original_json) + end + + if message then + local _, err = send_message(message) + if err then + ngx.log(ngx.ERR, "failed to log message: ", err) + return + end end -- After logging, cache any new cities we see from GeoIP in our database. diff --git a/src/api-umbrella/proxy/log_utils.lua b/src/api-umbrella/proxy/log_utils.lua index af5f045cb..6a393f501 100644 --- a/src/api-umbrella/proxy/log_utils.lua +++ b/src/api-umbrella/proxy/log_utils.lua @@ -51,16 +51,6 @@ local function uppercase_truncate(value, max_length) return string.upper(truncate_string(value, max_length)) end -local function sanitize_request_header(value) - local cleaned_value, _, gsub_err = re_gsub(value, "\xC0\xA2", "??", "jo") - if gsub_err then - ngx.log(ngx.ERR, "regex error: ", gsub_err) - return value - end - - return cleaned_value -end - local function remove_envoy_empty_header_value(value) if value and value == "-" then return nil @@ -296,20 +286,20 @@ function _M.normalized_data(data) api_backend_response_flags = remove_envoy_empty_header_value(truncate(data["api_backend_response_flags"], 20)), gatekeeper_denied_code = lowercase_truncate(data["gatekeeper_denied_code"], 50), request_id = lowercase_truncate(data["request_id"], 20), - request_accept = sanitize_request_header(truncate(data["request_accept"], 200)), - request_accept_encoding = sanitize_request_header(truncate(data["request_accept_encoding"], 200)), + request_accept = truncate(data["request_accept"], 200), + request_accept_encoding = truncate(data["request_accept_encoding"], 200), request_basic_auth_username = truncate(data["request_basic_auth_username"], 200), - request_connection = sanitize_request_header(truncate(data["request_connection"], 200)), - request_content_type = sanitize_request_header(truncate(data["request_content_type"], 200)), + request_connection = truncate(data["request_connection"], 200), + request_content_type = truncate(data["request_content_type"], 200), request_ip = lowercase_truncate(data["request_ip"], 45), request_ip_city = truncate(data["request_ip_city"], 200), request_ip_country = uppercase_truncate(data["request_ip_country"], 2), request_ip_region = uppercase_truncate(data["request_ip_region"], 2), request_method = uppercase_truncate(data["request_method"], 10), - request_origin = sanitize_request_header(truncate(data["request_origin"], 200)), - request_referer = sanitize_request_header(truncate(data["request_referer"], 200)), + request_origin = truncate(data["request_origin"], 200), + request_referer = truncate(data["request_referer"], 200), request_size = tonumber(data["request_size"]), - request_host = sanitize_request_header(lowercase_truncate(data["request_url_host"], 200)), + request_host = lowercase_truncate(data["request_url_host"], 200), request_path = truncate(data["request_url_path"], 4000), request_url_hierarchy_level0 = truncate(data["request_url_hierarchy_level0"], 200), request_url_hierarchy_level1 = truncate(data["request_url_hierarchy_level1"], 200), @@ -320,7 +310,7 @@ function _M.normalized_data(data) request_url_hierarchy_level6 = truncate(data["request_url_hierarchy_level6"], 200), request_url_query = truncate(data["request_url_query"], 4000), request_scheme = lowercase_truncate(data["request_url_scheme"], 10), - request_user_agent = sanitize_request_header(truncate(data["request_user_agent"], 400)), + request_user_agent = truncate(data["request_user_agent"], 400), request_user_agent_family = truncate(data["request_user_agent_family"], 100), request_user_agent_type = truncate(data["request_user_agent_type"], 100), response_age = tonumber(data["response_age"]), diff --git a/test/proxy/logging/test_special_chars.rb b/test/proxy/logging/test_special_chars.rb index 5e439bb8f..c1b4241b0 100644 --- a/test/proxy/logging/test_special_chars.rb +++ b/test/proxy/logging/test_special_chars.rb @@ -3,7 +3,6 @@ class Test::Proxy::Logging::TestSpecialChars < Minitest::Test include ApiUmbrellaTestHelpers::Setup include ApiUmbrellaTestHelpers::Logging - parallelize_me! def setup super @@ -107,7 +106,7 @@ def test_valid_utf8_encoding_in_url_path_url_params_headers end def test_invalid_utf8_encoding_in_url_path_url_params_headers - log_tail = LogTail.new("fluent-bit/current") + log_tail = LogTail.new("nginx/current") # Test various encodings of the ISO-8859-1 pound symbol: £ (but since this # is the ISO-8859-1 version, it's not valid UTF-8). @@ -127,33 +126,12 @@ def test_invalid_utf8_encoding_in_url_path_url_params_headers record = wait_for_log(response)[:hit_source] - log = log_tail.read.encode("UTF-8", invalid: :replace) - # Fluent Bit's UTF-8 handling appears to be different on x86_64 versus - # ARM64. I believe related to this open issue: - # https://github.com/fluent/fluent-bit/issues/7995 - # - # So on ARM64, it complains about UTF-8, which I think is more expected - # since we are sending in invalid encoded data. But on x86-64 systems, this - # doesn't appear to happen. I think we're okay with either behavior, - # really, we mainly just want to make sure things don't crash when - # encountering this type of weird input. - if RUBY_PLATFORM.start_with?("x86_64") - refute_match("invalid UTF-8 bytes found, skipping bytes", log) - else - assert_match("invalid UTF-8 bytes found, skipping bytes", log) - end - # Since the encoding of this string wasn't actually a valid UTF-8 string, # we test situations where it's sent as the raw ISO-8859-1 value, as well # as the UTF-8 replacement character. expected_raw_in_url_path = url_encoded.downcase expected_raw_in_url_query = url_encoded - # See above for differences in platform. - if RUBY_PLATFORM.start_with?("x86_64") - expected_raw_in_header = "\uE0A3" - else - expected_raw_in_header = "" - end + expected_raw_in_header = "\uFFFD" expected_raw_utf8_in_url_path = "%ef%bf%bd" expected_raw_utf8_in_url_query = "%EF%BF%BD" expected_raw_utf8_in_header = Base64.decode64("77+9").force_encoding("utf-8") @@ -194,6 +172,10 @@ def test_invalid_utf8_encoding_in_url_path_url_params_headers assert_equal(base64ed, record["request_referer"]) assert_equal(expected_raw_in_header, record["request_origin"]) assert_equal(expected_raw_utf8_in_header, record["request_accept"]) + + log = log_tail.read_until(/log message contained invalid utf-8, cleaned/) + assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log) + assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log) end def test_encoded_strings_as_given @@ -295,6 +277,8 @@ def test_slashes_and_backslashes end def test_invalid_quotes + log_tail = LogTail.new("nginx/current") + response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({ :headers => { "User-Agent" => Base64.decode64("eyJ1c2VyX2FnZW50IjogImZvbyDAp8CiIGJhciJ9"), @@ -304,13 +288,42 @@ def test_invalid_quotes assert_response_code(200, response) record = wait_for_log(response)[:hit_source] - # CI returns a slightly different response than local dev for some reason. - if record["request_referer"].include?("foo ?? ") - assert_equal("{\"user_agent\": \"foo ?? bar\"}", record["request_referer"]) - assert_equal("{\"user_agent\": \"foo ?? bar\"}", record["request_user_agent"]) - else - assert_equal("{\"user_agent\": \"foo \xC0\xA7?? bar\"}", record["request_referer"]) - assert_equal("{\"user_agent\": \"foo \xC0\xA7?? bar\"}", record["request_user_agent"]) - end + assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["request_referer"]) + assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["request_user_agent"]) + + response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({ + :headers => { + "X-Api-Key" => Base64.decode64("eyJ1c2VyX2FnZW50IjogImZvbyDAp8CiIGJhciJ9"), + }, + })) + assert_response_code(403, response) + + record = wait_for_log(response)[:hit_source] + assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["api_key"]) + + log = log_tail.read_until(/log message contained invalid utf-8, cleaned/) + assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log) + assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log) + end + + def test_utf8_json_quoting + log_tail = LogTail.new("nginx/current") + + response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({ + :headers => { + "User-Agent" => "\" \u0022 \\u0022 %22 " " \u201D", + "Referer" => "\" \u0022 \\u0022 %22 " " \u201D", + "X-Api-Key" => "\" \u0022 \\u0022 %22 " " \u201D", + }, + })) + assert_response_code(403, response) + + record = wait_for_log(response)[:hit_source] + assert_equal("\" \" \\u0022 %22 " " \u201D", record["request_referer"]) + assert_equal("\" \" \\u0022 %22 " " \u201D", record["request_user_agent"]) + assert_equal("\" \" \\u0022 %22 " " \u201D", record["api_key"]) + + log = log_tail.read + refute_match("log message contained invalid utf-8", log) end end diff --git a/test/support/log_tail.rb b/test/support/log_tail.rb index 5a4cc2cda..0dc133319 100644 --- a/test/support/log_tail.rb +++ b/test/support/log_tail.rb @@ -11,7 +11,7 @@ def read output = nil File.open(@path) do |file| file.seek(@pos) - output = file.read + output = file.read.encode("UTF-8", invalid: :replace) @pos = file.pos end @@ -23,7 +23,7 @@ def read_until(regex, timeout: 5) begin Timeout.timeout(timeout) do loop do - output << self.read + output << self.read.encode("UTF-8", invalid: :replace) break if output.match(regex) sleep 0.1