Use make_strings_children in parse_data nested json reader (#12382)

Use `make_strings_children` utility in parse_data nested json reader Addresses part of #12167 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) URL: #12382
rapidsai · Jan 10, 2023 · 0b8eb42 · 0b8eb42
1 parent f011c85
commit 0b8eb42
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 121 deletions.
diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/types.hpp>
 
@@ -61,12 +63,9 @@ enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILUR
 /**
  * @brief Providing additional information about the type casting result.
  */
-template <typename in_iterator_t, typename out_iterator_t>
 struct data_casting_result_info {
-  // One past the last input element that was parsed
-  in_iterator_t input_parsed_end;
-  // One past the last output element that was written
-  out_iterator_t output_processed_end;
+  // Number of bytes written to output
+  size_type bytes;
   // Whether parsing succeeded, item was parsed to null, or failed
   data_casting_result result;
 };
@@ -128,20 +127,15 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str)
 }
 
 /**
- * @brief Writes the UTF-8 byte sequence to \p out_it and returns the iterator to one past the
- * last item that was written to \p out_it
+ * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to
+ * \p out_it
  */
-template <typename utf8_char_t, typename out_it_t>
-__device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_it_t out_it)
+constexpr size_type write_utf8_char(char_utf8 character, char*& out_it)
 {
-  constexpr size_type MAX_UTF8_BYTES_PER_CODE_POINT = 4;
-  char char_bytes[MAX_UTF8_BYTES_PER_CODE_POINT];
-  auto const num_chars_written = strings::detail::from_char_utf8(utf8_chars, char_bytes);
-
-  for (size_type i = 0; i < MAX_UTF8_BYTES_PER_CODE_POINT; i++) {
-    if (i < num_chars_written) { *out_it++ = char_bytes[i]; }
-  }
-  return out_it;
+  auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character)
+                                         : strings::detail::from_char_utf8(character, out_it);
+  if (out_it) out_it += bytes;
+  return bytes;
 }
 
 /**
@@ -150,47 +144,39 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_
  *
  * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to
  * char
- * @tparam out_iterator_t A forward output iterator type
  * @param in_begin Iterator to the first item to process
  * @param in_end Iterator to one past the last item to process
- * @param out_it Iterator to the first item to write
+ * @param d_buffer Output character buffer to the first item to write
  * @param options Settings for controlling string processing behavior
- * @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an
- * iterator to one past the last character from the input that was processed, out_it_end is an
- * iterator to one past the last character that was written, set_null is true if a null literal
- * was read or a parsing error occurred, and is_invalid is true if a parsing error was
- * encountered
+ * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is
+ * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether
+ * parsing succeeded, item was parsed to null, or failed.
  */
-template <typename in_iterator_t, typename out_iterator_t>
-__device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_t> process_string(
-  in_iterator_t in_begin,
-  in_iterator_t in_end,
-  out_iterator_t out_it,
-  cudf::io::parse_options_view const& options)
+template <typename in_iterator_t>
+__device__ __forceinline__ data_casting_result_info
+process_string(in_iterator_t in_begin,
+               in_iterator_t in_end,
+               char* d_buffer,
+               cudf::io::parse_options_view const& options)
 {
-  auto const num_in_chars = thrust::distance(in_begin, in_end);
-
-  // Check if the value corresponds to the null literal
-  auto const is_null_literal =
-    serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
-  if (is_null_literal) { return {in_begin, out_it, data_casting_result::PARSED_TO_NULL}; }
-
-  // Whether in the original JSON this was a string value enclosed in quotes
-  // ({"a":"foo"} vs. {"a":1.23})
-  char const quote_char     = '"';
-  char const backslash_char = '\\';
-
+  int32_t bytes           = 0;
+  const auto num_in_chars = thrust::distance(in_begin, in_end);
   // String values are indicated by keeping the quote character
-  bool const is_string_value =
-    num_in_chars >= 2LL && (*in_begin == quote_char) && (*thrust::prev(in_end) == quote_char);
+  bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) &&
+                               (*thrust::prev(in_end) == options.quotechar);
 
   // Copy literal/numeric value
   if (not is_string_value) {
     while (in_begin != in_end) {
-      *out_it++ = *in_begin++;
+      if (d_buffer) *d_buffer++ = *in_begin;
+      ++in_begin;
+      ++bytes;
     }
-    return {in_begin, out_it, data_casting_result::PARSING_SUCCESS};
+    return {bytes, data_casting_result::PARSING_SUCCESS};
   }
+  // Whether in the original JSON this was a string value enclosed in quotes
+  // ({"a":"foo"} vs. {"a":1.23})
+  char const backslash_char = '\\';
 
   // Escape-flag, set after encountering a backslash character
   bool escape = false;
@@ -206,7 +192,10 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
     // Copy single character to output
     if (!escape) {
       escape = (*in_begin == backslash_char);
-      if (!escape) { *out_it++ = *in_begin; }
+      if (!escape) {
+        if (d_buffer) *d_buffer++ = *in_begin;
+        ++bytes;
+      }
       ++in_begin;
       continue;
     }
@@ -219,13 +208,12 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
     auto escaped_char = get_escape_char(*in_begin);
 
     // We escaped an invalid escape character -> "fail"/null for this item
-    if (escaped_char == NON_ESCAPE_CHAR) {
-      return {in_begin, out_it, data_casting_result::PARSING_FAILURE};
-    }
+    if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; }
 
     // Regular, single-character escape
     if (escaped_char != UNICODE_SEQ) {
-      *out_it++ = escaped_char;
+      if (d_buffer) *d_buffer++ = escaped_char;
+      ++bytes;
       ++in_begin;
       continue;
     }
@@ -238,13 +226,13 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
     // Make sure that there's at least 4 characters left from the
     // input, which are expected to be hex digits
     if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) {
-      return {in_begin, out_it, data_casting_result::PARSING_FAILURE};
+      return {bytes, data_casting_result::PARSING_FAILURE};
     }
 
     auto hex_val = parse_unicode_hex(in_begin);
 
     // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item
-    if (hex_val < 0) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; }
+    if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; }
 
     // Skip over the four hex digits
     thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT);
@@ -269,21 +257,59 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
       uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
                                     (hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
       auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point);
-      out_it          = write_utf8_char(utf8_chars, out_it);
+      bytes += write_utf8_char(utf8_chars, d_buffer);
     }
 
     // Just a single \uXXXX sequence
     else {
       auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val);
-      out_it          = write_utf8_char(utf8_chars, out_it);
+      bytes += write_utf8_char(utf8_chars, d_buffer);
     }
   }
 
   // The last character of the input is a backslash -> "fail"/null for this item
-  if (escape) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; }
-  return {in_begin, out_it, data_casting_result::PARSING_SUCCESS};
+  if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
+  return {bytes, data_casting_result::PARSING_SUCCESS};
 }
 
+template <typename str_tuple_it>
+struct string_parse {
+  str_tuple_it str_tuples;
+  bitmask_type* null_mask;
+  cudf::io::parse_options_view const options;
+  size_type* d_offsets{};
+  char* d_chars{};
+
+  __device__ void operator()(size_type idx)
+  {
+    if (not bit_is_set(null_mask, idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const in_begin     = str_tuples[idx].first;
+    auto const in_end       = in_begin + str_tuples[idx].second;
+    auto const num_in_chars = str_tuples[idx].second;
+
+    // Check if the value corresponds to the null literal
+    auto const is_null_literal =
+      (!d_chars) &&
+      serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
+    if (is_null_literal) {
+      clear_bit(null_mask, idx);
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+
+    char* d_buffer        = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
+    if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
+      clear_bit(null_mask, idx);
+      if (!d_chars) d_offsets[idx] = 0;
+    } else {
+      if (!d_chars) d_offsets[idx] = str_process_info.bytes;
+    }
+  }
+};
 /**
  * @brief Parses the data from an iterator of string views, casting it to the given target data type
  *
@@ -307,67 +333,18 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
 {
   CUDF_FUNC_RANGE();
   if (col_type == cudf::data_type{cudf::type_id::STRING}) {
-    rmm::device_uvector<size_type> offsets(col_size + 1, stream);
-
-    // Compute string sizes of the post-processed strings
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       col_size,
-                       [str_tuples,
-                        sizes     = device_span<size_type>{offsets},
-                        null_mask = static_cast<bitmask_type*>(null_mask.data()),
-                        options] __device__(size_type row) {
-                         // String at current offset is null, e.g., due to omissions
-                         // ([{"b":"foo"},{"a":"foo"}])
-                         if (not bit_is_set(null_mask, row)) {
-                           sizes[row] = 0;
-                           return;
-                         }
-
-                         auto const in_begin = str_tuples[row].first;
-                         auto const in_end   = in_begin + str_tuples[row].second;
-                         auto out_it         = cub::DiscardOutputIterator<>{};
-                         auto const str_process_info =
-                           process_string(in_begin, in_end, out_it, options);
-
-                         // The total number of characters that we're supposed to copy out
-                         auto const num_chars_copied_out =
-                           thrust::distance(out_it, str_process_info.output_processed_end);
-
-                         // If, during parsing, an error occurred or we parsed the null literal ->
-                         // set to null
-                         if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
-                           sizes[row] = 0;
-                           clear_bit(null_mask, row);
-                         } else {
-                           sizes[row] = num_chars_copied_out;
-                         }
-                       });
-
-    // Compute offsets for the post-processed strings
-    thrust::exclusive_scan(
-      rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
-
-    // Write out post-processed strings (stripping off quotes, replacing escape sequences)
-    rmm::device_uvector<char> chars(offsets.back_element(stream), stream);
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       col_size,
-                       [str_tuples,
-                        chars     = device_span<char>{chars},
-                        offsets   = device_span<size_type>{offsets},
-                        null_mask = static_cast<bitmask_type*>(null_mask.data()),
-                        options] __device__(size_type row) {
-                         if (not bit_is_set(null_mask, row)) { return; }
-
-                         auto const in_begin = str_tuples[row].first;
-                         auto const in_end   = in_begin + str_tuples[row].second;
-                         auto out_it         = &chars[offsets[row]];
-                         process_string(in_begin, in_end, out_it, options);
-                       });
-
+    // this utility calls the functor to build the offsets and chars columns
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      string_parse<decltype(str_tuples)>{
+        str_tuples, static_cast<bitmask_type*>(null_mask.data()), options},
+      col_size,
+      stream,
+      mr);
+
+    auto null_count =
+      cudf::detail::null_count(static_cast<bitmask_type*>(null_mask.data()), 0, col_size, stream);
     return make_strings_column(
-      col_size, std::move(offsets), std::move(chars), std::move(null_mask));
+      col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask));
   }
 
   auto out_col = make_fixed_width_column(

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
@@ -567,10 +567,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto make_validity =
-    [stream](device_json_column& json_col) -> std::pair<rmm::device_buffer, size_type> {
+  auto validity_size_check = [](device_json_column& json_col) {
     CUDF_EXPECTS(json_col.validity.size() >= bitmask_allocation_size_bytes(json_col.num_rows),
                  "valid_count is too small");
+  };
+  auto make_validity = [stream, validity_size_check](
+                         device_json_column& json_col) -> std::pair<rmm::device_buffer, size_type> {
+    validity_size_check(json_col);
     auto null_count =
       cudf::detail::null_count(json_col.validity.data(), 0, json_col.num_rows, stream);
     // full null_mask is always required for parse_data
@@ -626,11 +629,12 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
         target_type = cudf::io::detail::infer_data_type(
           options.json_view(), d_input, string_ranges_it, col_size, stream);
       }
+      validity_size_check(json_col);
       // Convert strings to the inferred data type
       auto col = experimental::detail::parse_data(string_spans_it,
                                                   col_size,
                                                   target_type,
-                                                  make_validity(json_col).first,
+                                                  json_col.validity.release(),
                                                   options.view(),
                                                   stream,
                                                   mr);