Performance optimize BFS (including direction optimizing BFS implemen…

…tation, mainly for single-GPU) (#4527) This PR includes multiple performance optimizations to improve BFS performance (including direction optimizing BFS for symmetric graphs). * Implement direction optimizing BFS * Add the transform_reduce_v_frontier_outgoing_e_by_src primitive (this resembles transform_reduce_v_frontier_outgoing_e_by_dst but performs reduction based on src) * Update the fill_edge_src|dst_property to take the fill value as the last input parameter (to be consistent with thrust::fill or std::fill) * Update the fill_edge_src|dst_property to take a vertex list to fill only a subset of vertex property values. * Update the update_edge_src|dst_property primitive to take a mutable view (to be consistent with other primitives) * Update the vertex_frontier bucket to take a raft::device_span (supports creation of non-owning bucket, this saves a copy operation in some cases) * Fix compiler warnings (cuda 12.4) * Few updates in code cosmetics. Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Joseph Nke (https://github.com/jnke2016) URL: #4527
rapidsai · Jul 10, 2024 · b55c279 · b55c279
1 parent e299a59
commit b55c279
Show file tree

Hide file tree

Showing 57 changed files with 2,147 additions and 842 deletions.
diff --git a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -127,7 +127,7 @@ class edge_partition_edge_property_device_view_t {
     value_t>
   atomic_add(edge_t offset, value_t val) const
   {
-    cugraph::atomic_add(value_first_ + offset, val);
+    return cugraph::atomic_add(value_first_ + offset, val);
   }
 
   template <typename Iter = ValueIterator>
@@ -154,7 +154,7 @@ class edge_partition_edge_property_device_view_t {
     value_t>
   elementwise_atomic_min(edge_t offset, value_t val) const
   {
-    cugraph::elementwise_atomic_min(value_first_ + offset, val);
+    return cugraph::elementwise_atomic_min(value_first_ + offset, val);
   }
 
   template <typename Iter = ValueIterator, typename T = value_t>
@@ -164,7 +164,7 @@ class edge_partition_edge_property_device_view_t {
     value_t>
   elementwise_atomic_max(edge_t offset, value_t val) const
   {
-    cugraph::elementwise_atomic_max(value_first_ + offset, val);
+    return cugraph::elementwise_atomic_max(value_first_ + offset, val);
   }
 
  private:

diff --git a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -133,7 +133,7 @@ class edge_partition_endpoint_property_device_view_t {
   atomic_add(vertex_t offset, value_t val) const
   {
     auto val_offset = value_offset(offset);
-    cugraph::atomic_add(value_first_ + val_offset, val);
+    return cugraph::atomic_add(value_first_ + val_offset, val);
   }
 
   template <typename Iter = ValueIterator>
@@ -162,7 +162,7 @@ class edge_partition_endpoint_property_device_view_t {
   elementwise_atomic_min(vertex_t offset, value_t val) const
   {
     auto val_offset = value_offset(offset);
-    cugraph::elementwise_atomic_min(value_first_ + val_offset, val);
+    return cugraph::elementwise_atomic_min(value_first_ + val_offset, val);
   }
 
   template <typename Iter = ValueIterator, typename T = value_t>
@@ -173,7 +173,7 @@ class edge_partition_endpoint_property_device_view_t {
   elementwise_atomic_max(vertex_t offset, value_t val) const
   {
     auto val_offset = value_offset(offset);
-    cugraph::elementwise_atomic_max(value_first_ + val_offset, val);
+    return cugraph::elementwise_atomic_max(value_first_ + val_offset, val);
   }
 
  private:

diff --git a/cpp/include/cugraph/utilities/atomic_ops.cuh b/cpp/include/cugraph/utilities/atomic_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cugraph/utilities/packed_bool_utils.hpp>
 #include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <raft/util/device_atomics.cuh>
@@ -112,7 +113,7 @@ __device__
                    T>
   atomic_and(Iterator iter, T value)
 {
-  detail::thrust_tuple_atomic_and(
+  return detail::thrust_tuple_atomic_and(
     iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
@@ -140,7 +141,7 @@ __device__
                    T>
   atomic_or(Iterator iter, T value)
 {
-  detail::thrust_tuple_atomic_or(
+  return detail::thrust_tuple_atomic_or(
     iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
@@ -155,22 +156,22 @@ template <typename Iterator, typename T>
 __device__
   std::enable_if_t<std::is_arithmetic_v<T> &&
                      std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
-                   void>
+                   T>
   atomic_add(Iterator iter, T value)
 {
-  atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
+  return atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
 }
 
 template <typename Iterator, typename T>
 __device__
   std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
                      is_thrust_tuple<T>::value,
-                   void>
+                   T>
   atomic_add(Iterator iter, T value)
 {
   static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
                 thrust::tuple_size<T>::value);
-  detail::thrust_tuple_atomic_add(
+  return detail::thrust_tuple_atomic_add(
     iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
@@ -191,7 +192,7 @@ __device__
                    T>
   elementwise_atomic_cas(Iterator iter, T compare, T value)
 {
-  detail::thrust_tuple_elementwise_atomic_cas(
+  return detail::thrust_tuple_elementwise_atomic_cas(
     iter, compare, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
@@ -206,22 +207,22 @@ template <typename Iterator, typename T>
 __device__
   std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
                      std::is_arithmetic<T>::value,
-                   void>
+                   T>
   elementwise_atomic_min(Iterator iter, T const& value)
 {
-  atomicMin(&(thrust::raw_reference_cast(*iter)), value);
+  return atomicMin(&(thrust::raw_reference_cast(*iter)), value);
 }
 
 template <typename Iterator, typename T>
 __device__
   std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
                      is_thrust_tuple<T>::value,
-                   void>
+                   T>
   elementwise_atomic_min(Iterator iter, T const& value)
 {
   static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
                 thrust::tuple_size<T>::value);
-  detail::thrust_tuple_elementwise_atomic_min(
+  return detail::thrust_tuple_elementwise_atomic_min(
     iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
@@ -236,23 +237,35 @@ template <typename Iterator, typename T>
 __device__
   std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
                      std::is_arithmetic<T>::value,
-                   void>
+                   T>
   elementwise_atomic_max(Iterator iter, T const& value)
 {
-  atomicMax(&(thrust::raw_reference_cast(*iter)), value);
+  return atomicMax(&(thrust::raw_reference_cast(*iter)), value);
 }
 
 template <typename Iterator, typename T>
 __device__
   std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
                      is_thrust_tuple<T>::value,
-                   void>
+                   T>
   elementwise_atomic_max(Iterator iter, T const& value)
 {
   static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
                 thrust::tuple_size<T>::value);
-  detail::thrust_tuple_elementwise_atomic_max(
+  return detail::thrust_tuple_elementwise_atomic_max(
     iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
 }
 
+template <typename Iterator, typename T>
+__device__ void packed_bool_atomic_set(Iterator iter, T offset, bool val)
+{
+  auto packed_output_offset = packed_bool_offset(offset);
+  auto packed_output_mask   = packed_bool_mask(offset);
+  if (val) {
+    atomicOr(iter + packed_output_offset, packed_output_mask);
+  } else {
+    atomicAnd(iter + packed_output_offset, ~packed_output_mask);
+  }
+}
+
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp
@@ -40,30 +40,30 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence<Is...>,
     buffer_size, stream_view)...);
 }
 
-template <typename TupleType, std::size_t... I>
-auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
+template <typename TupleType, std::size_t... Is>
+auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
 {
-  return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).begin())...));
+  return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).begin())...));
 }
 
-template <typename TupleType, std::size_t... I>
-auto get_dataframe_buffer_end_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
+template <typename TupleType, std::size_t... Is>
+auto get_dataframe_buffer_end_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
 {
-  return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).end())...));
+  return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).end())...));
 }
 
-template <typename TupleType, size_t... I>
-auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
+template <typename TupleType, size_t... Is>
+auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
 {
   // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator.
-  return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).cbegin())...));
+  return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).cbegin())...));
 }
 
-template <typename TupleType, std::size_t... I>
-auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
+template <typename TupleType, std::size_t... Is>
+auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
 {
   // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator.
-  return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).cend())...));
+  return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).cend())...));
 }
 
 }  // namespace detail

diff --git a/cpp/src/c_api/graph_helper_impl.cuh b/cpp/src/c_api/graph_helper_impl.cuh
@@ -37,7 +37,7 @@ edge_property_t<GraphViewType, T> create_constant_edge_property(raft::handle_t c
 {
   edge_property_t<GraphViewType, T> edge_property(handle, graph_view);
 
-  cugraph::fill_edge_property(handle, graph_view, constant_value, edge_property);
+  cugraph::fill_edge_property(handle, graph_view, edge_property.mutable_view(), constant_value);
 
   return edge_property;
 }

diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh
@@ -23,7 +23,7 @@
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
 #include "prims/transform_e.cuh"
 #include "prims/transform_reduce_v.cuh"
-#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh"
+#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include "prims/update_v_frontier.cuh"
 #include "prims/vertex_frontier.cuh"
@@ -130,8 +130,8 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<edge_t>> brandes_b
   edge_t hop{0};
 
   while (true) {
-    update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas);
-    update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances);
+    update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view());
+    update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view());
 
     auto [new_frontier, new_sigma] =
       transform_reduce_v_frontier_outgoing_e_by_dst(handle,
@@ -228,12 +228,12 @@ void accumulate_vertex_results(
     handle,
     graph_view,
     thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-    src_properties);
+    src_properties.mutable_view());
   update_edge_dst_property(
     handle,
     graph_view,
     thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-    dst_properties);
+    dst_properties.mutable_view());
 
   // FIXME: To do this efficiently, I need a version of
   //   per_v_transform_reduce_outgoing_e that takes a vertex list
@@ -272,12 +272,12 @@ void accumulate_vertex_results(
       handle,
       graph_view,
       thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-      src_properties);
+      src_properties.mutable_view());
     update_edge_dst_property(
       handle,
       graph_view,
       thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-      dst_properties);
+      dst_properties.mutable_view());
 
     thrust::transform(handle.get_thrust_policy(),
                       centralities.begin(),
@@ -323,12 +323,12 @@ void accumulate_edge_results(
     handle,
     graph_view,
     thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-    src_properties);
+    src_properties.mutable_view());
   update_edge_dst_property(
     handle,
     graph_view,
     thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-    dst_properties);
+    dst_properties.mutable_view());
 
   //
   //   For now this will do a O(E) pass over all edges over the diameter
@@ -417,12 +417,12 @@ void accumulate_edge_results(
       handle,
       graph_view,
       thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-      src_properties);
+      src_properties.mutable_view());
     update_edge_dst_property(
       handle,
       graph_view,
       thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
-      dst_properties);
+      dst_properties.mutable_view());
   }
 }
 
@@ -594,9 +594,11 @@ edge_betweenness_centrality(
   if (graph_view.has_edge_mask()) {
     auto unmasked_graph_view = graph_view;
     unmasked_graph_view.clear_edge_mask();
-    fill_edge_property(handle, unmasked_graph_view, weight_t{0}, centralities, do_expensive_check);
+    fill_edge_property(
+      handle, unmasked_graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check);
   } else {
-    fill_edge_property(handle, graph_view, weight_t{0}, centralities, do_expensive_check);
+    fill_edge_property(
+      handle, graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check);
   }
 
   size_t num_sources = thrust::distance(vertices_begin, vertices_end);

diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh
@@ -98,7 +98,7 @@ rmm::device_uvector<weight_t> eigenvector_centrality(
                  old_centralities.data());
 
     update_edge_src_property(
-      handle, pull_graph_view, old_centralities.begin(), edge_src_centralities);
+      handle, pull_graph_view, old_centralities.begin(), edge_src_centralities.mutable_view());
 
     if (edge_weight_view) {
       per_v_transform_reduce_incoming_e(

diff --git a/cpp/src/centrality/katz_centrality_impl.cuh b/cpp/src/centrality/katz_centrality_impl.cuh
@@ -107,7 +107,7 @@ void katz_centrality(
     std::swap(new_katz_centralities, old_katz_centralities);
 
     update_edge_src_property(
-      handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities);
+      handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities.mutable_view());
 
     if (edge_weight_view) {
       per_v_transform_reduce_incoming_e(