Skip to content

Commit

Permalink
SWDEV-489158: Optimizing counter collection performance (#1150)
Browse files Browse the repository at this point in the history
* SWDEV-489158: Optimizing counter collection performance

* Static initializer fix

* adding sched_yield+sleep
  • Loading branch information
ApoKalipse-V authored Oct 24, 2024
1 parent 42765c3 commit aef1889
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions source/lib/rocprofiler-sdk/hsa/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,14 @@ namespace hsa
{
namespace
{
/// Accessor for the shared atomic counter of "balanced signal slots".
///
/// The counter is created on the first call through
/// common::static_object<std::atomic<int64_t>>::construct and seeded with 16;
/// every call returns a reference to that same atomic.
/// NOTE(review): the lifetime/teardown semantics of common::static_object are
/// not visible from here -- confirm before relying on this during shutdown.
static std::atomic<int64_t>&
get_balanced_signal_slots()
{
    // Starting number of slots stored in the counter.
    constexpr int64_t initial_slot_count = 16;

    // Bind once to the pointer handed back by static_object; later calls reuse it.
    static auto*& slot_counter =
        common::static_object<std::atomic<int64_t>>::construct(initial_slot_count);

    return *slot_counter;
}

template <typename DomainT, typename... Args>
inline bool
context_filter(const context::context* ctx, DomainT domain, Args... args)
Expand Down Expand Up @@ -106,6 +114,8 @@ AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data)
return false;
}

get_balanced_signal_slots().fetch_add(1);

auto& queue_info_session = *static_cast<Queue::queue_info_session_t*>(data);
auto dispatch_time = kernel_dispatch::get_dispatch_time(queue_info_session);

Expand Down Expand Up @@ -342,6 +352,13 @@ WriteInterceptor(const void* packets,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH);

// If there is a lot of contention for HSA signals, then schedule out the thread
if(get_balanced_signal_slots().fetch_sub(1) <= 0)
{
sched_yield();
std::this_thread::sleep_for(std::chrono::microseconds(1));
}

// Stores the instrumentation pkt (i.e. AQL packets for counter collection)
// along with an ID of the client we got the packet from (this will be returned via
// completed_cb_t)
Expand Down

0 comments on commit aef1889

Please sign in to comment.