[Runtime] Optimize CUDA multi-stream by merging copy stream into compute stream. (#557)

Set MERGE_COMPUTE_COPY_STREAM=true to enable the merged stream (a minimal usage sketch follows the change summary below). Merging the copy stream into the compute stream reduces the synchronization cost between the two streams and can improve performance when multi-stream (multiple compute streams) is enabled.
shanshanpt authored Nov 17, 2022
1 parent 63f6b9a commit 05f76dc
Showing 16 changed files with 200 additions and 89 deletions.
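
For context, a minimal usage sketch (not part of this commit; it assumes a TensorFlow C++ build where tensorflow/core/util/env_var.h is available): DirectSession reads MERGE_COMPUTE_COPY_STREAM once at construction via ReadBoolFromEnvVar, so the variable must be set in the environment before the process creates its session.

#include <cstdlib>
#include <iostream>

#include "tensorflow/core/util/env_var.h"

int main() {
  // Opt in for this process. In practice the variable is usually exported in
  // the launch environment instead, e.g. MERGE_COMPUTE_COPY_STREAM=true python train.py
  setenv("MERGE_COMPUTE_COPY_STREAM", "true", /*overwrite=*/1);

  // Mirrors the call added to DirectSession's constructor in this commit;
  // the flag defaults to false when the variable is unset.
  bool merge = false;
  tensorflow::Status s = tensorflow::ReadBoolFromEnvVar(
      "MERGE_COMPUTE_COPY_STREAM", /*default_val=*/false, &merge);
  if (!s.ok()) {
    std::cerr << s.ToString() << std::endl;
    return 1;
  }
  std::cout << "merge_compute_and_copy_stream = " << std::boolalpha << merge
            << std::endl;
  return 0;
}

Note that GPUUtil::MergeComputeAndCopyStream() below caches the value on its first call, so toggling the variable after the first GPU copy has no effect within a running process.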
18 changes: 17 additions & 1 deletion tensorflow/core/common_runtime/direct_session.cc
@@ -766,6 +766,10 @@ DirectSession::DirectSession(const SessionOptions& options,
LOG(INFO) << "Current DirectSession " << this << " will be pinned to core: " << msg;
thread_pools_[0].first->SetThreadPoolAffinity(cpuset);
}

tensorflow::ReadBoolFromEnvVar("MERGE_COMPUTE_COPY_STREAM",
/*default_val=*/false,
&merge_compute_and_copy_stream_);
}

DirectSession::~DirectSession() {
@@ -961,8 +965,16 @@ Status DirectSession::RunInternal(

// Start parallel Executors.
const size_t num_executors = executors_and_keys->items.size();
// ref_send_inputs is filled while the graph executes.
std::vector<TensorReference*> ref_send_inputs;
ExecutorBarrier* barrier = new ExecutorBarrier(
num_executors, run_state.rendez, [&run_state](const Status& ret) {
num_executors, run_state.rendez,
[&run_state, &ref_send_inputs](const Status& ret) {
VLOG(2) << "To unref buffer size: " << ref_send_inputs.size();
for (auto& ref : ref_send_inputs) {
ref->Unref();
}
ref_send_inputs.clear();
{
mutex_lock l(run_state.mu_);
run_state.status.Update(ret);
@@ -994,6 +1006,10 @@ Status DirectSession::RunInternal(
args.executor_policy = ExecutorPolicy::USE_NORMAL_EXECUTOR;
}

args.ref_send_inputs_mu_ptr = std::make_unique<mutex>();
args.ref_send_inputs_ptr = &ref_send_inputs;
args.merge_compute_and_copy_stream = merge_compute_and_copy_stream_;

const bool do_trace = (run_options.trace_level() > RunOptions::NO_TRACE);

bool update_cost_model = false;
4 changes: 4 additions & 0 deletions tensorflow/core/common_runtime/direct_session.h
@@ -447,6 +447,10 @@ class DirectSession : public Session {
int multi_stream_num_ = 0;
ResourceMgr* multi_stream_shared_rmgr_ = nullptr;

// The user decides whether to use the compute stream as the copy stream
// by setting the environment variable 'MERGE_COMPUTE_COPY_STREAM'.
bool merge_compute_and_copy_stream_ = false;

TF_DISALLOW_COPY_AND_ASSIGN(DirectSession);

// EXPERIMENTAL: debugger (tfdbg) related
43 changes: 33 additions & 10 deletions tensorflow/core/common_runtime/executor.cc
@@ -395,6 +395,11 @@ class ExecutorState {
bool finish_when_deferred_ops_done_ TF_GUARDED_BY(num_deferred_ops_mu_) =
false;

// References to the input tensors of send ops.
mutex* ref_send_inputs_mu_ptr_;
std::vector<TensorReference*>* ref_send_inputs_ptr_;
bool merge_compute_and_copy_stream_;

mutex mu_;
Status status_ TF_GUARDED_BY(mu_);
};
@@ -489,8 +494,8 @@ struct SortTaggedNode {
SortTaggedNode(const std::vector<int64>* immutable_accumulative_cost) :
immutable_accumulative_cost_(immutable_accumulative_cost) {}
bool operator()(const TaggedNode& n1, const TaggedNode& n2) {
return (*immutable_accumulative_cost_)[n1.get_node_item().node_id] >
(*immutable_accumulative_cost_)[n2.get_node_item().node_id];
return (*immutable_accumulative_cost_)[n1.get_node_item().node->id()] >
(*immutable_accumulative_cost_)[n2.get_node_item().node->id()];
}
const std::vector<int64>* immutable_accumulative_cost_;
};
@@ -537,7 +542,10 @@ ExecutorState<PropagatorStateType>::ExecutorState(
sync_on_finish_(args.sync_on_finish),
executor_policy_(args.executor_policy),
propagator_(immutable_state, step_id_, vlog_),
num_outstanding_ops_(0) {
num_outstanding_ops_(0),
ref_send_inputs_mu_ptr_(args.ref_send_inputs_mu_ptr.get()),
ref_send_inputs_ptr_(args.ref_send_inputs_ptr),
merge_compute_and_copy_stream_(args.merge_compute_and_copy_stream) {
// TODO: FIXME Consider function lib executor later
//if (args.cost_runner == nullptr) {
// LOG(FATAL) << "cost_runner is nullptr, please check the args.";
@@ -668,16 +676,31 @@ Status ExecutorState<PropagatorStateType>::ProcessSync(
NodeExecStatsInterface* stats) {
Status s;
OpKernelContext ctx(params, item.num_outputs);
nodestats::SetOpStart(stats);

ExecutorInternal::KernelStatsInfo kernel_stat_buffer;
kernel_stats_->StartCollectOp(&item, &kernel_stat_buffer);

OpKernel* op_kernel = item.kernel;
Device* device = immutable_state_.params().device;
if (item.virtual_device.get() != nullptr) {
device = item.virtual_device.get();
}

if (merge_compute_and_copy_stream_ &&
(op_kernel->type_string() == "_HostSend" ||
(op_kernel->type_string() == "_Send" &&
device->parsed_name().type == "CPU")) &&
item.node->attrs().Find("recv_device")->s().find("GPU") != string::npos &&
(*params->inputs)[0].tensor->NumElements() > 0) {
CHECK(item.num_inputs == 1);  // a send op takes exactly one input tensor
TensorReference* ref = new TensorReference(*((*params->inputs)[0].tensor));
{
mutex_lock l(*ref_send_inputs_mu_ptr_);
ref_send_inputs_ptr_->push_back(std::move(ref));
}
}

nodestats::SetOpStart(stats);

ExecutorInternal::KernelStatsInfo kernel_stat_buffer;
kernel_stats_->StartCollectOp(&item, &kernel_stat_buffer);

const bool is_expensive = kernel_stats_->IsExpensive(item);

if (TF_PREDICT_FALSE(MightTrace(item, event_collector_))) {
@@ -748,7 +771,7 @@ void ExecutorState<PropagatorStateType>::ProcessAsync(
Status s = ProcessOutputs(*state->item, &state->ctx, outputs.data(), stats);
nodestats::SetMemory(stats, &state->ctx);
if (vlog_) {
VLOG(2) << "Async kernel done: " << state->item->node_id << " step "
VLOG(2) << "Async kernel done: " << state->item->node->id() << " step "
<< step_id_ << " " << SummarizeNodeDef(state->item->kernel->def())
<< (state->tagged_node.get_is_dead() ? " is dead" : "")
<< " device: " << device->name();
@@ -898,7 +921,7 @@ void ExecutorState<PropagatorStateType>::BatchProcess(std::vector<TaggedNode> no
tagged_node = inline_ready.front();
inline_ready.pop_front();
const NodeItem& item = tagged_node.get_node_item();
const int id = item.node_id;
const int id = item.node->id();

propagator_.MaybeMarkStarted(tagged_node);

6 changes: 6 additions & 0 deletions tensorflow/core/common_runtime/executor.h
@@ -121,6 +121,12 @@ class Executor {
CostRunner cost_runner = nullptr;

ExecutorPolicy executor_policy = ExecutorPolicy::USE_NORMAL_EXECUTOR;

// Store refs to CPU tensors that will be sent to the GPU,
// and release them when the session run finishes.
std::unique_ptr<mutex> ref_send_inputs_mu_ptr;
std::vector<TensorReference*>* ref_send_inputs_ptr = nullptr;
bool merge_compute_and_copy_stream = false;
};
typedef std::function<void(const Status&)> DoneCallback;
virtual void RunAsync(const Args& args, DoneCallback done) = 0;
47 changes: 36 additions & 11 deletions tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -698,17 +698,42 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
return err;
}

StatusCallback wrapped_done = std::bind(
[to, copy](StatusCallback done_,
// Begin unbound arguments.
const Status& s) {
if (s.ok()) {
*to = std::move(*copy);
}
delete copy;
done_(s);
},
std::move(done), std::placeholders::_1);
StatusCallback wrapped_done;
if (GPUUtil::MergeComputeAndCopyStream()) {
TensorReference input_ref(from);
auto recv_host_to_device_stream = device_contexts_[0]->stream();
auto event_mgr = em_;
wrapped_done = std::bind(
[to, copy, recv_host_to_device_stream, event_mgr, input_ref](
StatusCallback done_,
// Begin unbound arguments.
const Status& s) {
event_mgr->ThenExecute(
recv_host_to_device_stream,
[to, copy, recv_host_to_device_stream, done_, &s, input_ref]() {
input_ref.Unref();
if (!recv_host_to_device_stream->ok()) {
LOG(FATAL) << "CPU->GPU Memcpy failed";
}
*to = std::move(*copy);
delete copy;
done_(s);
});
},
std::move(done), std::placeholders::_1);
} else {
wrapped_done = std::bind(
[to, copy](StatusCallback done_,
// Begin unbound arguments.
const Status& s) {
if (s.ok()) {
*to = std::move(*copy);
}
delete copy;
done_(s);
},
std::move(done), std::placeholders::_1);
}

tracing::ScopedAnnotation annotation("MakeTensorFromProto");
device_contexts_[0]->CopyCPUTensorToDevice(
98 changes: 66 additions & 32 deletions tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/util.h"

// IMPLEMENTATION NOTE:
@@ -111,6 +112,23 @@ void* GetBase(const Tensor* src) {

void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }

/*static*/
bool GPUUtil::MergeComputeAndCopyStream() {
static bool merge = false;
static bool check_setting = true;
if (check_setting) {
static mutex mu;
mutex_lock l(mu);
if (check_setting) {
tensorflow::ReadBoolFromEnvVar("MERGE_COMPUTE_COPY_STREAM",
/*default_val=*/false, &merge);
check_setting = false;
}
}

return merge;
}

/*static*/
void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
const DeviceContext* device_context,
@@ -273,16 +291,22 @@ void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
return;
}

auto send_device_to_host_stream =
static_cast<const GPUDeviceContext*>(device_context)
->device_to_host_stream();
if (send_device_to_host_stream == nullptr) {
done(errors::Internal("No send gpu copy-out-stream is available."));
return;
}
// Wait for the sender's main stream to make sure the data are available.
if (send_device_to_host_stream != send_stream) {
send_device_to_host_stream->ThenWaitFor(send_stream);
se::Stream* send_device_to_host_stream = nullptr;
if (MergeComputeAndCopyStream()) {
send_device_to_host_stream = send_stream;
} else {
send_device_to_host_stream =
static_cast<const GPUDeviceContext*>(device_context)
->device_to_host_stream();
if (send_device_to_host_stream == nullptr) {
done(errors::Internal("No send gpu copy-out-stream is available."));
return;
}

// Wait for the sender's main stream to make sure the data are available.
if (send_device_to_host_stream != send_stream) {
send_device_to_host_stream->ThenWaitFor(send_stream);
}
}

const int64 total_bytes = gpu_tensor->TotalBytes();
@@ -320,17 +344,22 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
return;
}

auto recv_host_to_device_stream =
static_cast<const GPUDeviceContext*>(device_context)
->host_to_device_stream();
if (recv_host_to_device_stream == nullptr) {
done(errors::Internal("No send gpu copy-out-stream is available."));
return;
}
// Wait for the recv-stream to make sure the buffer is truly available.
if (sync_dst_compute) {
if (recv_host_to_device_stream != recv_stream) {
recv_host_to_device_stream->ThenWaitFor(recv_stream);
se::Stream* recv_host_to_device_stream = nullptr;
if (MergeComputeAndCopyStream()) {
recv_host_to_device_stream = recv_stream;
} else {
recv_host_to_device_stream =
static_cast<const GPUDeviceContext*>(device_context)
->host_to_device_stream();
if (recv_host_to_device_stream == nullptr) {
done(errors::Internal("No send gpu copy-out-stream is available."));
return;
}
// Wait for the recv-stream to make sure the buffer is truly available.
if (sync_dst_compute) {
if (recv_host_to_device_stream != recv_stream) {
recv_host_to_device_stream->ThenWaitFor(recv_stream);
}
}
}

@@ -342,17 +371,22 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
}
// Use of cpu_tensor may outlive stack scope, so keep a ref.
TensorReference input_ref(*cpu_tensor);
dev_info->event_mgr->ThenExecute(
recv_host_to_device_stream,
[recv_host_to_device_stream, done, input_ref]() {
input_ref.Unref();
if (!recv_host_to_device_stream->ok()) {
LOG(FATAL) << "CPU->GPU Memcpy failed";
}
done(Status::OK());
});

if (MergeComputeAndCopyStream()) {
done(Status::OK());
} else {
// Use of cpu_tensor may outlive stack scope, so keep a ref.
TensorReference input_ref(*cpu_tensor);
dev_info->event_mgr->ThenExecute(
recv_host_to_device_stream,
[recv_host_to_device_stream, done, input_ref]() {
input_ref.Unref();
if (!recv_host_to_device_stream->ok()) {
LOG(FATAL) << "CPU->GPU Memcpy failed";
}
done(Status::OK());
});
}
}

Status GPUUtil::Sync(Device* gpu_device) {
4 changes: 4 additions & 0 deletions tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -105,6 +105,10 @@ class GPUUtil {
const Tensor* src_gpu_tensor,
Tensor* dst_gpu_tensor,
StatusCallback done);

// The user decides whether to use the compute stream as the copy stream
// by setting the environment variable 'MERGE_COMPUTE_COPY_STREAM'.
static bool MergeComputeAndCopyStream();
};

} // namespace tensorflow
2 changes: 1 addition & 1 deletion tensorflow/core/common_runtime/graph_view.cc
@@ -36,7 +36,7 @@ limitations under the License.
namespace tensorflow {

string NodeItem::DebugString() const {
string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node_id);
string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node->id());
if (is_source) {
strings::StrAppend(&ret, " source}");
} else {
3 changes: 1 addition & 2 deletions tensorflow/core/common_runtime/graph_view.h
@@ -58,8 +58,7 @@ struct ControlEdgeInfo {
//
// Each NodeItem is an element of exactly one GraphView.
struct NodeItem {
// The index of this node's item in its GraphView.
int node_id = -1;
const Node* node = nullptr;

// Cached attributes of this node for fast lookup.
bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr
2 changes: 1 addition & 1 deletion tensorflow/core/common_runtime/immutable_executor_state.cc
@@ -126,7 +126,7 @@ Status ImmutableExecutorState::Initialize() {
FrameInfo* frame_info = EnsureFrameInfo(frame_name);

NodeItem* item = gview_.node(id);
item->node_id = id;
item->node = n;

item->input_start = frame_info->total_inputs;
frame_info->total_inputs += n->num_inputs();
2 changes: 1 addition & 1 deletion tensorflow/core/common_runtime/immutable_executor_state.h
@@ -103,7 +103,7 @@ class ImmutableExecutorState {

const FrameInfo& get_enter_frame_info(const NodeItem& node_item) const {
DCHECK(node_item.is_enter);
return *enter_frame_info_[node_item.node_id];
return *enter_frame_info_[node_item.node->id()];
}

bool requires_control_flow_support() const { return requires_control_flow_; }