Skip to content

Commit

Permalink
Performance optimize BFS (including direction optimizing BFS implemen…
Browse files Browse the repository at this point in the history
…tation, mainly for single-GPU) (#4527)

This PR includes multiple performance optimizations to improve BFS performance (including direction optimizing BFS for symmetric graphs).

* Implement direction optimizing BFS
* Add the transform_reduce_v_frontier_outgoing_e_by_src primitive (this resembles transform_reduce_v_frontier_outgoing_e_by_dst but performs reduction based on src)
* Update the fill_edge_src|dst_property to take the fill value as the last input parameter (to be consistent with thrust::fill or std::fill)
* Update the fill_edge_src|dst_property to take a vertex list to fill only a subset of vertex property values.
* Update the update_edge_src|dst_property primitive to take a mutable view (to be consistent with other primitives)
* Update the vertex_frontier bucket to take a raft::device_span (supports creation of non-owning bucket, this saves a copy operation in some cases)
* Fix compiler warnings (cuda 12.4)
* Few updates in code cosmetics.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Joseph Nke (https://github.com/jnke2016)

URL: #4527
  • Loading branch information
seunghwak authored Jul 10, 2024
1 parent e299a59 commit b55c279
Show file tree
Hide file tree
Showing 57 changed files with 2,147 additions and 842 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -127,7 +127,7 @@ class edge_partition_edge_property_device_view_t {
value_t>
atomic_add(edge_t offset, value_t val) const
{
cugraph::atomic_add(value_first_ + offset, val);
return cugraph::atomic_add(value_first_ + offset, val);
}

template <typename Iter = ValueIterator>
Expand All @@ -154,7 +154,7 @@ class edge_partition_edge_property_device_view_t {
value_t>
elementwise_atomic_min(edge_t offset, value_t val) const
{
cugraph::elementwise_atomic_min(value_first_ + offset, val);
return cugraph::elementwise_atomic_min(value_first_ + offset, val);
}

template <typename Iter = ValueIterator, typename T = value_t>
Expand All @@ -164,7 +164,7 @@ class edge_partition_edge_property_device_view_t {
value_t>
elementwise_atomic_max(edge_t offset, value_t val) const
{
cugraph::elementwise_atomic_max(value_first_ + offset, val);
return cugraph::elementwise_atomic_max(value_first_ + offset, val);
}

private:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -133,7 +133,7 @@ class edge_partition_endpoint_property_device_view_t {
atomic_add(vertex_t offset, value_t val) const
{
auto val_offset = value_offset(offset);
cugraph::atomic_add(value_first_ + val_offset, val);
return cugraph::atomic_add(value_first_ + val_offset, val);
}

template <typename Iter = ValueIterator>
Expand Down Expand Up @@ -162,7 +162,7 @@ class edge_partition_endpoint_property_device_view_t {
elementwise_atomic_min(vertex_t offset, value_t val) const
{
auto val_offset = value_offset(offset);
cugraph::elementwise_atomic_min(value_first_ + val_offset, val);
return cugraph::elementwise_atomic_min(value_first_ + val_offset, val);
}

template <typename Iter = ValueIterator, typename T = value_t>
Expand All @@ -173,7 +173,7 @@ class edge_partition_endpoint_property_device_view_t {
elementwise_atomic_max(vertex_t offset, value_t val) const
{
auto val_offset = value_offset(offset);
cugraph::elementwise_atomic_max(value_first_ + val_offset, val);
return cugraph::elementwise_atomic_max(value_first_ + val_offset, val);
}

private:
Expand Down
45 changes: 29 additions & 16 deletions cpp/include/cugraph/utilities/atomic_ops.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/
#pragma once

#include <cugraph/utilities/packed_bool_utils.hpp>
#include <cugraph/utilities/thrust_tuple_utils.hpp>

#include <raft/util/device_atomics.cuh>
Expand Down Expand Up @@ -112,7 +113,7 @@ __device__
T>
atomic_and(Iterator iter, T value)
{
detail::thrust_tuple_atomic_and(
return detail::thrust_tuple_atomic_and(
iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

Expand Down Expand Up @@ -140,7 +141,7 @@ __device__
T>
atomic_or(Iterator iter, T value)
{
detail::thrust_tuple_atomic_or(
return detail::thrust_tuple_atomic_or(
iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

Expand All @@ -155,22 +156,22 @@ template <typename Iterator, typename T>
__device__
std::enable_if_t<std::is_arithmetic_v<T> &&
std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
void>
T>
atomic_add(Iterator iter, T value)
{
atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
return atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
}

template <typename Iterator, typename T>
__device__
std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
is_thrust_tuple<T>::value,
void>
T>
atomic_add(Iterator iter, T value)
{
static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
thrust::tuple_size<T>::value);
detail::thrust_tuple_atomic_add(
return detail::thrust_tuple_atomic_add(
iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

Expand All @@ -191,7 +192,7 @@ __device__
T>
elementwise_atomic_cas(Iterator iter, T compare, T value)
{
detail::thrust_tuple_elementwise_atomic_cas(
return detail::thrust_tuple_elementwise_atomic_cas(
iter, compare, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

Expand All @@ -206,22 +207,22 @@ template <typename Iterator, typename T>
__device__
std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
std::is_arithmetic<T>::value,
void>
T>
elementwise_atomic_min(Iterator iter, T const& value)
{
atomicMin(&(thrust::raw_reference_cast(*iter)), value);
return atomicMin(&(thrust::raw_reference_cast(*iter)), value);
}

template <typename Iterator, typename T>
__device__
std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
is_thrust_tuple<T>::value,
void>
T>
elementwise_atomic_min(Iterator iter, T const& value)
{
static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
thrust::tuple_size<T>::value);
detail::thrust_tuple_elementwise_atomic_min(
return detail::thrust_tuple_elementwise_atomic_min(
iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

Expand All @@ -236,23 +237,35 @@ template <typename Iterator, typename T>
__device__
std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
std::is_arithmetic<T>::value,
void>
T>
elementwise_atomic_max(Iterator iter, T const& value)
{
atomicMax(&(thrust::raw_reference_cast(*iter)), value);
return atomicMax(&(thrust::raw_reference_cast(*iter)), value);
}

template <typename Iterator, typename T>
__device__
std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
is_thrust_tuple<T>::value,
void>
T>
elementwise_atomic_max(Iterator iter, T const& value)
{
static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
thrust::tuple_size<T>::value);
detail::thrust_tuple_elementwise_atomic_max(
return detail::thrust_tuple_elementwise_atomic_max(
iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
}

template <typename Iterator, typename T>
__device__ void packed_bool_atomic_set(Iterator iter, T offset, bool val)
{
auto packed_output_offset = packed_bool_offset(offset);
auto packed_output_mask = packed_bool_mask(offset);
if (val) {
atomicOr(iter + packed_output_offset, packed_output_mask);
} else {
atomicAnd(iter + packed_output_offset, ~packed_output_mask);
}
}

} // namespace cugraph
24 changes: 12 additions & 12 deletions cpp/include/cugraph/utilities/dataframe_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,30 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence<Is...>,
buffer_size, stream_view)...);
}

template <typename TupleType, std::size_t... I>
auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
template <typename TupleType, std::size_t... Is>
auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
{
return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).begin())...));
return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).begin())...));
}

template <typename TupleType, std::size_t... I>
auto get_dataframe_buffer_end_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
template <typename TupleType, std::size_t... Is>
auto get_dataframe_buffer_end_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
{
return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).end())...));
return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).end())...));
}

template <typename TupleType, size_t... I>
auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
template <typename TupleType, size_t... Is>
auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
{
// thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator.
return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).cbegin())...));
return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).cbegin())...));
}

template <typename TupleType, std::size_t... I>
auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence<I...>, TupleType& buffer)
template <typename TupleType, std::size_t... Is>
auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence<Is...>, TupleType& buffer)
{
// thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator.
return thrust::make_zip_iterator(thrust::make_tuple((std::get<I>(buffer).cend())...));
return thrust::make_zip_iterator(thrust::make_tuple((std::get<Is>(buffer).cend())...));
}

} // namespace detail
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/c_api/graph_helper_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ edge_property_t<GraphViewType, T> create_constant_edge_property(raft::handle_t c
{
edge_property_t<GraphViewType, T> edge_property(handle, graph_view);

cugraph::fill_edge_property(handle, graph_view, constant_value, edge_property);
cugraph::fill_edge_property(handle, graph_view, edge_property.mutable_view(), constant_value);

return edge_property;
}
Expand Down
28 changes: 15 additions & 13 deletions cpp/src/centrality/betweenness_centrality_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
#include "prims/transform_e.cuh"
#include "prims/transform_reduce_v.cuh"
#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh"
#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh"
#include "prims/update_edge_src_dst_property.cuh"
#include "prims/update_v_frontier.cuh"
#include "prims/vertex_frontier.cuh"
Expand Down Expand Up @@ -130,8 +130,8 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<edge_t>> brandes_b
edge_t hop{0};

while (true) {
update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas);
update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances);
update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view());
update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view());

auto [new_frontier, new_sigma] =
transform_reduce_v_frontier_outgoing_e_by_dst(handle,
Expand Down Expand Up @@ -228,12 +228,12 @@ void accumulate_vertex_results(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
src_properties);
src_properties.mutable_view());
update_edge_dst_property(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
dst_properties);
dst_properties.mutable_view());

// FIXME: To do this efficiently, I need a version of
// per_v_transform_reduce_outgoing_e that takes a vertex list
Expand Down Expand Up @@ -272,12 +272,12 @@ void accumulate_vertex_results(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
src_properties);
src_properties.mutable_view());
update_edge_dst_property(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
dst_properties);
dst_properties.mutable_view());

thrust::transform(handle.get_thrust_policy(),
centralities.begin(),
Expand Down Expand Up @@ -323,12 +323,12 @@ void accumulate_edge_results(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
src_properties);
src_properties.mutable_view());
update_edge_dst_property(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
dst_properties);
dst_properties.mutable_view());

//
// For now this will do a O(E) pass over all edges over the diameter
Expand Down Expand Up @@ -417,12 +417,12 @@ void accumulate_edge_results(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
src_properties);
src_properties.mutable_view());
update_edge_dst_property(
handle,
graph_view,
thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()),
dst_properties);
dst_properties.mutable_view());
}
}

Expand Down Expand Up @@ -594,9 +594,11 @@ edge_betweenness_centrality(
if (graph_view.has_edge_mask()) {
auto unmasked_graph_view = graph_view;
unmasked_graph_view.clear_edge_mask();
fill_edge_property(handle, unmasked_graph_view, weight_t{0}, centralities, do_expensive_check);
fill_edge_property(
handle, unmasked_graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check);
} else {
fill_edge_property(handle, graph_view, weight_t{0}, centralities, do_expensive_check);
fill_edge_property(
handle, graph_view, centralities.mutable_view(), weight_t{0}, do_expensive_check);
}

size_t num_sources = thrust::distance(vertices_begin, vertices_end);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/centrality/eigenvector_centrality_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ rmm::device_uvector<weight_t> eigenvector_centrality(
old_centralities.data());

update_edge_src_property(
handle, pull_graph_view, old_centralities.begin(), edge_src_centralities);
handle, pull_graph_view, old_centralities.begin(), edge_src_centralities.mutable_view());

if (edge_weight_view) {
per_v_transform_reduce_incoming_e(
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/centrality/katz_centrality_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ void katz_centrality(
std::swap(new_katz_centralities, old_katz_centralities);

update_edge_src_property(
handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities);
handle, pull_graph_view, old_katz_centralities, edge_src_katz_centralities.mutable_view());

if (edge_weight_view) {
per_v_transform_reduce_incoming_e(
Expand Down
Loading

0 comments on commit b55c279

Please sign in to comment.