Performance optimization of JSON validation (#16996)

As part of JSON validation, field, value, and string tokens are validated. Currently, the code uses a single transform_inclusive_scan. Because the transform functor is a heavy operation, it slows down the entire scan drastically. This PR splits the transform and the scan in validation into two separate steps. The runtime of validation went from 200ms to 20ms. Also, a few hardcoded string comparisons are moved to a trie. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) - Robert (Bobby) Evans (https://github.com/revans2) URL: #16996
rapidsai · Oct 8, 2024 · 553d8ec · 553d8ec
1 parent cc23474
commit 553d8ec
Showing 1 changed file with 50 additions and 38 deletions.
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/tokenize_json.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
@@ -87,38 +88,41 @@ void validate_token_stream(device_span<char const> d_input,
 {
   CUDF_FUNC_RANGE();
   if (!options.is_strict_validation()) { return; }
+
+  rmm::device_uvector<bool> d_invalid = cudf::detail::make_zeroed_device_uvector_async<bool>(
+    tokens.size(), stream, cudf::get_current_device_resource_ref());
+
   using token_t = cudf::io::json::token_t;
-  cudf::detail::optional_trie trie_na =
-    cudf::detail::create_serialized_trie(options.get_na_values(), stream);
-  auto trie_na_view    = cudf::detail::make_trie_view(trie_na);
+  auto literals = options.get_na_values();
+  literals.emplace_back("null");  // added these too to single trie
+  literals.emplace_back("true");
+  literals.emplace_back("false");
+
+  cudf::detail::optional_trie trie_literals =
+    cudf::detail::create_serialized_trie(literals, stream);
+  cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie(
+    {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream);
+
   auto validate_values = cuda::proclaim_return_type<bool>(
     [data                        = d_input.data(),
-     trie_na                     = trie_na_view,
+     trie_literals               = cudf::detail::make_trie_view(trie_literals),
+     trie_nonnumeric             = cudf::detail::make_trie_view(trie_nonnumeric),
      allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
      allow_nonnumeric =
        options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
                                                            SymbolOffsetT end) -> bool {
       // This validates an unquoted value. A value must match https://www.json.org/json-en.html
       // but the leading and trailing whitespace should already have been removed, and the value
       // is not a string
-      auto c               = data[start];
-      auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
-      if (is_null_literal) {
-        return true;
-      } else if ('n' == c) {
-        return substr_eq(data, start, end, 4, "null");
-      } else if ('t' == c) {
-        return substr_eq(data, start, end, 4, "true");
-      } else if ('f' == c) {
-        return substr_eq(data, start, end, 5, "false");
-      } else if (allow_nonnumeric && c == 'N') {
-        return substr_eq(data, start, end, 3, "NaN");
-      } else if (allow_nonnumeric && c == 'I') {
-        return substr_eq(data, start, end, 8, "Infinity");
-      } else if (allow_nonnumeric && c == '+') {
-        return substr_eq(data, start, end, 4, "+INF") ||
-               substr_eq(data, start, end, 9, "+Infinity");
-      } else if ('-' == c || c <= '9' && 'c' >= '0') {
+      auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start});
+      if (is_literal) { return true; }
+      if (allow_nonnumeric) {
+        auto const is_nonnumeric =
+          serialized_trie_contains(trie_nonnumeric, {data + start, end - start});
+        if (is_nonnumeric) { return true; }
+      }
+      auto c = data[start];
+      if ('-' == c || c <= '9' && 'c' >= '0') {
         // number
         auto num_state = number_state::START;
         for (auto at = start; at < end; at++) {
@@ -140,9 +144,6 @@ void validate_token_stream(device_span<char const> d_input,
                 num_state = number_state::LEADING_ZERO;
               } else if (c >= '1' && c <= '9') {
                 num_state = number_state::WHOLE;
-              } else if (allow_nonnumeric && 'I' == c) {
-                return substr_eq(data, start, end, 4, "-INF") ||
-                       substr_eq(data, start, end, 9, "-Infinity");
               } else {
                 return false;
               }
@@ -273,33 +274,44 @@ void validate_token_stream(device_span<char const> d_input,
 
   auto num_tokens = tokens.size();
   auto count_it   = thrust::make_counting_iterator(0);
-  auto predicate  = [tokens        = tokens.begin(),
-                    token_indices = token_indices.begin(),
-                    validate_values,
-                    validate_strings] __device__(auto i) -> bool {
+  auto predicate  = cuda::proclaim_return_type<bool>([tokens        = tokens.begin(),
+                                                     token_indices = token_indices.begin(),
+                                                     validate_values,
+                                                     validate_strings] __device__(auto i) -> bool {
     if (tokens[i] == token_t::ValueEnd) {
       return !validate_values(token_indices[i - 1], token_indices[i]);
     } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
       return !validate_strings(token_indices[i - 1], token_indices[i]);
     }
     return false;
-  };
+  });
+
+  auto conditional_invalidout_it =
+    cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type<void>(
+      [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void {
+        if (x) { d_invalid[i] = true; }
+      }));
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    count_it,
+                    count_it + num_tokens,
+                    conditional_invalidout_it,
+                    predicate);
 
   using scan_type            = write_if::scan_type;
   auto conditional_write     = write_if{tokens.begin(), num_tokens};
   auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
-  auto transform_op          = cuda::proclaim_return_type<scan_type>(
-    [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
-      if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
-      return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
-    });
-  auto binary_op = cuda::proclaim_return_type<scan_type>(
+  auto binary_op             = cuda::proclaim_return_type<scan_type>(
     [] __device__(scan_type prev, scan_type curr) -> scan_type {
       auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
-      return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
+      return {(curr.second ? curr.first : op_result), prev.second | curr.second};
+    });
+  auto transform_op = cuda::proclaim_return_type<scan_type>(
+    [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type {
+      if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
+      return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
     });
 
-  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+  thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream),
                                    count_it,
                                    count_it + num_tokens,
                                    conditional_output_it,