From 44b9ca22f15ba22c252f3f3f5b793bbf521a7a74 Mon Sep 17 00:00:00 2001
From: Rafal Banas
Date: Tue, 29 Oct 2024 15:33:13 +0100
Subject: [PATCH] Remove intermediate TensorLists. Improve performance

Signed-off-by: Rafal Banas
---
 .../experimental/resize_op_impl_cvcuda.h     | 83 ++++++++++---------
 dali/operators/image/resize/resize_op_impl.h |  8 +-
 dali/operators/nvcvop/nvcvop.cc              | 60 ++++++++++----
 dali/operators/nvcvop/nvcvop.h               | 23 ++++-
 4 files changed, 114 insertions(+), 60 deletions(-)

diff --git a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
index 08e75af6037..aff2f4414af 100644
--- a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
+++ b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
@@ -23,6 +23,7 @@
 #include "dali/kernels/imgproc/resample/params.h"
 #include "dali/operators/image/resize/resize_op_impl.h"
 #include "dali/operators/nvcvop/nvcvop.h"
+#include "dali/core/nvtx.h"
 
 namespace dali {
 
@@ -33,12 +34,13 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
 
   static_assert(spatial_ndim == 2 || spatial_ndim == 3, "Only 2D and 3D resizing is supported");
 
-  /// Dimensionality of each separate frame. If input contains no channel dimension, one is added
   static constexpr int frame_ndim = spatial_ndim + 1;
 
   void Setup(TensorListShape<> &out_shape, const TensorListShape<> &in_shape,
              int first_spatial_dim, span<const kernels::ResamplingParams> params) override {
+    first_spatial_dim_ = first_spatial_dim;
+
     // Calculate output shape of the input, as supplied (sequences, planar images, etc)
     GetResizedShape(out_shape, in_shape, params, spatial_ndim, first_spatial_dim);
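A note on `first_spatial_dim`, which this patch starts caching in `first_spatial_dim_`: it is the boundary between frame dimensions and spatial dimensions, so every extent to its left counts as frames. A minimal sketch of that bookkeeping, using a plain std::vector in place of DALI's TensorShape (the helper name is illustrative, not part of the patch):

    #include <cstdint>
    #include <vector>

    // Number of frames in a sample: the product of all extents before first_spatial_dim.
    // E.g. a video sample (F, H, W, C) with first_spatial_dim == 1 has F frames,
    // while a plain image (H, W, C) with first_spatial_dim == 0 has exactly one.
    int64_t NumFrames(const std::vector<int64_t> &shape, int first_spatial_dim) {
      int64_t n = 1;
      for (int d = 0; d < first_spatial_dim; ++d)
        n *= shape[d];
      return n;
    }

This is the same quantity that the `num_frames` helper added in the next hunk computes with `volume(&shape[0], &shape[first_spatial_dim])`.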
@@ -49,31 +51,40 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
     // effective frames (from videos, channel planes, etc).
     GetResizedShape(out_shape_, in_shape_, make_cspan(params_), 0);
 
-    // Create a map of non-empty samples
-    SetFrameIdxs();
-
     // Now that we know how many logical frames there are, calculate batch subdivision.
     CalculateMinibatchPartition(minibatch_size_);
+    CalculateSourceSamples(in_shape, first_spatial_dim);
+
     SetupKernel();
   }
 
-  // Set the frame_idx_ map with indices of samples that are not empty
-  void SetFrameIdxs() {
-    frame_idx_.clear();
-    frame_idx_.reserve(in_shape_.num_samples());
-    for (int i = 0; i < in_shape_.num_samples(); ++i) {
-      if (volume(out_shape_.tensor_shape_span(i)) != 0 &&
-          volume(in_shape_.tensor_shape_span(i)) != 0) {
-        frame_idx_.push_back(i);
-      }
-    }
-    total_frames_ = frame_idx_.size();
-  }
-
-  // get the index of a frame in the DALI TensorList
-  int frame_idx(int f) {
-    return frame_idx_[f];
+  // Assign each minibatch a range of frames in the original input/output TensorLists
+  void CalculateSourceSamples(const TensorListShape<> &original_shape, int first_spatial_dim) {
+    int64_t sample_id = 0;
+    int64_t frame_offset = 0;
+    for (auto &mb : minibatches_) {
+      auto v = original_shape[sample_id].num_elements();
+      while (v == 0) {
+        sample_id++;
+        v = original_shape[sample_id].num_elements();
+      }
+      mb.sample_offset = sample_id;
+      mb.frame_offset = frame_offset;
+      frame_offset = mb.frame_offset + mb.count;
+      int64_t frames_n = num_frames(original_shape[sample_id], first_spatial_dim);
+      while (frame_offset >= frames_n) {
+        frame_offset -= frames_n;
+        if (++sample_id >= original_shape.num_samples()) {
+          break;
+        }
+        frames_n = num_frames(original_shape[sample_id], first_spatial_dim);
+      }
+    }
+  }
+
+  int64_t num_frames(const TensorShape<> &shape, int first_spatial_dim) {
+    return volume(&shape[0], &shape[first_spatial_dim]);
   }
 
   void SetupKernel() {
@@ -88,15 +99,14 @@
     int end = mb.start + mb.count;
     for (int i = mb.start, j = 0; i < end; i++, j++) {
-      auto f_id = frame_idx(i);
-      rois_ptr[j] = GetRoi(params_[f_id]);
+      rois_ptr[j] = GetRoi(params_[i]);
       for (int d = 0; d < spatial_ndim; ++d) {
-        mb_input_shapes[j].extent[d] = static_cast<int>(in_shape_.tensor_shape_span(f_id)[d]);
+        mb_input_shapes[j].extent[d] = static_cast<int>(in_shape_.tensor_shape_span(i)[d]);
         mb_output_shapes[j].extent[d] =
-            static_cast<int>(out_shape_.tensor_shape_span(f_id)[d]);
+            static_cast<int>(out_shape_.tensor_shape_span(i)[d]);
       }
     }
-    int num_channels = in_shape_[frame_idx(0)][frame_ndim - 1];
+    int num_channels = in_shape_[0][frame_ndim - 1];
     HQResizeTensorShapesI mb_input_shape{mb_input_shapes.data(), mb.count, spatial_ndim,
                                          num_channels};
     HQResizeTensorShapesI mb_output_shape{mb_output_shapes.data(), mb.count, spatial_ndim,
                                           num_channels};
     mb.rois = HQResizeRoisF{mb.count, spatial_ndim, rois_ptr};
     rois_ptr += mb.count;
 
-    auto param = params_[frame_idx(mb.start)][0];
+    auto param = params_[mb.start][0];
     mb.min_interpolation = GetInterpolationType(param.min_filter);
     mb.mag_interpolation = GetInterpolationType(param.mag_filter);
     mb.antialias = param.min_filter.antialias || param.mag_filter.antialias;
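A note on the new `CalculateSourceSamples` above: each minibatch is assigned the sample index and in-sample frame offset at which its frames begin, with empty samples skipped. A self-contained sketch of the same mapping on plain vectors (illustrative helper, not part of the patch):

    #include <cstdint>
    #include <vector>

    struct Start { int64_t sample, frame; };

    // frames[i] is the frame count of sample i (0 means an empty sample);
    // counts[m] is the number of frames in minibatch m.
    std::vector<Start> MapMinibatches(const std::vector<int64_t> &frames,
                                      const std::vector<int64_t> &counts) {
      std::vector<Start> starts;
      int64_t sample = 0, frame = 0;
      const int64_t n = static_cast<int64_t>(frames.size());
      for (int64_t count : counts) {
        while (sample < n && frames[sample] == 0)
          ++sample;  // skip empty samples
        starts.push_back({sample, frame});
        frame += count;  // first frame of the next minibatch, relative to `sample`
        while (sample < n && frame >= frames[sample]) {
          frame -= frames[sample];  // step over the current (possibly empty) sample
          ++sample;
        }
      }
      return starts;
    }

For frames = {3, 0, 5} and minibatch counts = {4, 4}, this yields starts (0, 0) and (2, 1): the second minibatch begins at frame 1 of sample 2, because sample 1 is empty.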
@@ -149,12 +159,6 @@
     kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
     auto allocator = nvcvop::GetScratchpadAllocator(scratchpad);
 
-    in_frames_.ShareData(input);
-    in_frames_.Resize(in_shape_);
-
-    out_frames_.ShareData(output);
-    out_frames_.Resize(out_shape_);
-
     auto workspace_mem = AllocateWorkspaces(scratchpad);
 
     for (size_t b = 0; b < minibatches_.size(); b++) {
       MiniBatch &mb = minibatches_[b];
       auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count);
       auto mb_output = nvcv::TensorBatch(reqs, allocator);
       auto mb_input = nvcv::TensorBatch(reqs, allocator);
-      nvcvop::PushTensorsToBatch(mb_input, in_frames_, mb.start, mb.count, sample_layout_);
-      nvcvop::PushTensorsToBatch(mb_output, out_frames_, mb.start, mb.count, sample_layout_);
+      nvcvop::PushFramesToBatch(mb_input, input, first_spatial_dim_, mb.sample_offset,
+                                mb.frame_offset, mb.count, sample_layout_);
+      nvcvop::PushFramesToBatch(mb_output, output, first_spatial_dim_, mb.sample_offset,
+                                mb.frame_offset, mb.count, sample_layout_);
       resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output, mb.min_interpolation,
                  mb.mag_interpolation, mb.antialias, mb.rois);
     }
@@ -179,13 +185,14 @@
   }
 
   void CalculateMinibatchPartition(int minibatch_size) {
+    total_frames_ = in_shape_.num_samples();
     std::vector<std::pair<int, int>> continuous_ranges;
-    kernels::FilterDesc min_filter_desc = params_[frame_idx(0)][0].min_filter;
-    kernels::FilterDesc mag_filter_desc = params_[frame_idx(0)][0].mag_filter;
+    kernels::FilterDesc min_filter_desc = params_[0][0].min_filter;
+    kernels::FilterDesc mag_filter_desc = params_[0][0].mag_filter;
     int start_id = 0;
     for (int i = 0; i < total_frames_; i++) {
-      if (params_[frame_idx(i)][0].min_filter != min_filter_desc ||
-          params_[frame_idx(i)][0].mag_filter != mag_filter_desc) {
+      if (params_[i][0].min_filter != min_filter_desc ||
+          params_[i][0].mag_filter != mag_filter_desc) {
         // we break the range if different filter types are used
         continuous_ranges.emplace_back(start_id, i);
         start_id = i;
       }
@@ -214,9 +221,9 @@
   }
 
   TensorListShape<frame_ndim> in_shape_, out_shape_;
-  std::vector<int> frame_idx_;  // map of absolute frame indices in the input TensorList
   int total_frames_;  // number of non-empty frames
   std::vector<kernels::ResamplingParamsND<spatial_ndim>> params_;
+  int first_spatial_dim_;
 
   cvcuda::HQResize resize_op_{};
   nvcvop::NVCVOpWorkspace op_workspace_;
@@ -224,8 +231,8 @@
   std::vector<HQResizeRoiF> rois_;
   const TensorLayout sample_layout_ = (spatial_ndim == 2) ? "HWC" : "DHWC";
 
-  TensorList<GPUBackend> in_frames_;
-  TensorList<GPUBackend> out_frames_;
+  std::vector<nvcv::Tensor> in_frames_;
+  std::vector<nvcv::Tensor> out_frames_;
 
   struct MiniBatch {
     int start, count;
@@ -233,6 +240,8 @@
     NVCVInterpolationType mag_interpolation;
     bool antialias;
     HQResizeRoisF rois;
+    int64_t sample_offset;  // index of the starting sample in the original input/output lists
+    int64_t frame_offset;   // index of the starting frame within that sample
   };
 
   std::vector<MiniBatch> minibatches_;
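The partitioning above first breaks the frame sequence wherever the min/mag filter pair changes; each continuous range is then chopped into minibatches of at most `minibatch_size` frames. The exact chopping is outside the shown hunks; the sketch below assumes simple greedy chunks (illustrative only):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Split the frame range [start, end) into (start, count) minibatches
    // of at most minibatch_size frames each.
    std::vector<std::pair<int, int>> SplitRange(int start, int end, int minibatch_size) {
      std::vector<std::pair<int, int>> out;
      for (int b = start; b < end; b += minibatch_size)
        out.emplace_back(b, std::min(minibatch_size, end - b));
      return out;
    }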
diff --git a/dali/operators/image/resize/resize_op_impl.h b/dali/operators/image/resize/resize_op_impl.h
index 89111cf9038..300f533052f 100644
--- a/dali/operators/image/resize/resize_op_impl.h
+++ b/dali/operators/image/resize/resize_op_impl.h
@@ -63,7 +63,8 @@ void GetFrameShapesAndParams(
 
   for (int i = 0; i < N; i++) {
     auto in_sample_shape = in_shape.tensor_shape_span(i);
-    total_frames += volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
+    if (volume(in_sample_shape) > 0)
+      total_frames += volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
   }
 
   frame_params.resize(total_frames);
@@ -72,10 +73,11 @@ void GetFrameShapesAndParams(
   int ndim = in_shape.sample_dim();
   for (int i = 0, flat_frame_idx = 0; i < N; i++) {
     auto in_sample_shape = in_shape.tensor_shape_span(i);
+    if (volume(in_sample_shape) == 0) {
+      continue;  // skip empty samples
+    }
     // Collapse leading dimensions, if any, as frame dim. This handles channel-first.
     int seq_len = volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
-    if (seq_len == 0)
-      continue;  // skip empty sequences
 
     TensorShape<frame_ndim> frame_shape;
     frame_shape.resize(frame_ndim);
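The change above moves the empty-sample check in front of the frame counting: a sample with any zero extent contributes no frames, even when its leading (frame) dimensions are non-zero, e.g. a degenerate (5, 0, 640, 3) video. A sketch of the corrected counting on plain vectors (illustrative, not part of the patch):

    #include <cstdint>
    #include <vector>

    int64_t CountFrames(const std::vector<std::vector<int64_t>> &shapes,
                        int first_spatial_dim) {
      int64_t total = 0;
      for (const auto &s : shapes) {
        int64_t vol = 1, seq = 1;
        for (size_t d = 0; d < s.size(); ++d)
          vol *= s[d];  // total volume; 0 for empty samples
        for (int d = 0; d < first_spatial_dim; ++d)
          seq *= s[d];  // frame count: product of the leading dims
        if (vol > 0)
          total += seq;  // empty samples contribute no frames
      }
      return total;
    }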
diff --git a/dali/operators/nvcvop/nvcvop.cc b/dali/operators/nvcvop/nvcvop.cc
index ae327cbd0a9..a11a4b2a832 100644
--- a/dali/operators/nvcvop/nvcvop.cc
+++ b/dali/operators/nvcvop/nvcvop.cc
@@ -212,7 +212,7 @@ nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType daliDType,
                       TensorLayout layout) {
   auto dtype = GetDataType(daliDType, 1);
   nvcv::TensorDataStridedCuda::Buffer inBuf;
-  inBuf.basePtr = reinterpret_cast<NVCVByte *>(const_cast<void *>(data));
+  inBuf.basePtr = static_cast<NVCVByte *>(const_cast<void *>(data));
   inBuf.strides[shape.size() - 1] = dtype.strideBytes();
   for (int d = shape.size() - 2; d >= 0; --d) {
     inBuf.strides[d] = shape[d + 1] * inBuf.strides[d + 1];
@@ -229,7 +229,7 @@ nvcv::Tensor AsTensor(const void *data, span<const int64_t> shape_data, const nvcv::DataType &dtype,
                      const nvcv::TensorLayout &layout) {
   int ndim = shape_data.size();
   nvcv::TensorDataStridedCuda::Buffer inBuf;
-  inBuf.basePtr = reinterpret_cast<NVCVByte *>(const_cast<void *>(data));
+  inBuf.basePtr = static_cast<NVCVByte *>(const_cast<void *>(data));
   inBuf.strides[ndim - 1] = dtype.strideBytes();
   for (int d = ndim - 2; d >= 0; --d) {
     inBuf.strides[d] = shape_data[d + 1] * inBuf.strides[d + 1];
@@ -239,26 +239,54 @@
   return nvcv::TensorWrapData(inData);
 }
 
+int64_t calc_num_frames(const TensorShape<> &shape, int first_spatial_dim) {
+  return (first_spatial_dim > 0) ? volume(&shape[0], &shape[first_spatial_dim]) : 1;
+}
 
-void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
-                        int64_t start, int64_t count, const TensorLayout &layout) {
-  int ndim = t_list.sample_dim();
-  auto dtype = GetDataType(t_list.type(), 1);
-  TensorLayout out_layout = layout.empty() ? t_list.GetLayout() : layout;
-  DALI_ENFORCE(
-      out_layout.empty() || out_layout.size() == ndim,
-      make_string("Layout ", out_layout, " does not match the number of dimensions: ", ndim));
-  auto nvcv_layout = nvcv::TensorLayout(out_layout.c_str());
-  std::vector<nvcv::Tensor> tensors;
-  tensors.reserve(count);
-
-  for (int s = 0; s < count; ++s) {
-    tensors.push_back(AsTensor(t_list.raw_tensor(s + start), t_list.tensor_shape_span(s + start),
-                               dtype, nvcv_layout));
+void PushFramesToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
+                       int first_spatial_dim, int64_t starting_sample, int64_t frame_offset,
+                       int64_t num_frames, const TensorLayout &layout) {
+  int ndim = layout.ndim();
+  auto nvcv_layout = nvcv::TensorLayout(layout.c_str());
+  auto dtype = GetDataType(t_list.type());
+
+  std::vector<nvcv::Tensor> tensors;
+  tensors.reserve(num_frames);
+
+  const auto &input_shape = t_list.shape();
+  int64_t sample_id = starting_sample - 1;
+  auto type_size = dtype.strideBytes();
+  std::vector<int64_t> frame_shape(ndim, 1);
+
+  int64_t frame_stride = 0;
+  int64_t sample_nframes = 0;
+  const uint8_t *data = nullptr;
+
+  for (int64_t i = 0; i < num_frames; ++i) {
+    if (i == 0 || frame_offset == sample_nframes) {
+      if (i > 0)
+        frame_offset = 0;  // subsequent samples are consumed from their first frame
+      do {
+        ++sample_id;
+        DALI_ENFORCE(sample_id < t_list.num_samples());
+        auto sample_shape = input_shape[sample_id];
+        std::copy(&sample_shape[first_spatial_dim], &sample_shape[input_shape.sample_dim()],
+                  frame_shape.begin());
+        frame_stride = volume(frame_shape) * type_size;
+        sample_nframes = calc_num_frames(sample_shape, first_spatial_dim);
+      } while (sample_nframes * frame_stride == 0);  // we skip empty samples
+      data = static_cast<const uint8_t *>(t_list.raw_tensor(sample_id)) +
+             frame_stride * frame_offset;
+    }
+    tensors.push_back(AsTensor(data, make_span(frame_shape), dtype, nvcv_layout));
+    data += frame_stride;
+    frame_offset++;
   }
   batch.pushBack(tensors.begin(), tensors.end());
 }
 
+
 cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements &reqs,
                                             kernels::Scratchpad &scratchpad) {
   auto *hostBuffer = scratchpad.AllocateHost<uint8_t>(reqs.hostMem.size, reqs.hostMem.alignment);
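`PushFramesToBatch` above avoids the old intermediate per-frame TensorList by wrapping each frame's bytes directly: within one sample, frame f starts at a fixed byte offset from the sample base. A minimal sketch of that invariant (names illustrative):

    #include <cstdint>

    // Frames are contiguous within a sample: frame f of a sample whose frames
    // hold frame_volume elements of type_size bytes each starts here.
    const uint8_t *FramePtr(const uint8_t *sample_base, int64_t frame_volume,
                            int64_t type_size, int64_t f) {
      const int64_t frame_stride = frame_volume * type_size;  // bytes per frame
      return sample_base + f * frame_stride;
    }

Each nvcv::Tensor produced by AsTensor is just such a view, so no data is copied; the patch merely removes the in_frames_/out_frames_ TensorLists that previously had to be re-shared and resized on every run only to expose the same pointers.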
diff --git a/dali/operators/nvcvop/nvcvop.h b/dali/operators/nvcvop/nvcvop.h
index c4e61161c2d..407f9d0edec 100644
--- a/dali/operators/nvcvop/nvcvop.h
+++ b/dali/operators/nvcvop/nvcvop.h
@@ -134,12 +134,27 @@ void AllocateImagesLike(nvcv::ImageBatchVarShape &output,
                         const TensorList<GPUBackend> &t_list);
 
 /**
- * @brief Push samples from a given tensor list to a given TensorBatch.
- * [start, start+count) determines the range of samples in the TensorList that will be used.
+ * @brief Push a range of frames from the input TensorList as samples in the output TensorBatch.
+ *
+ * The input TensorList is interpreted as a sequence of frames, where the innermost dimensions,
+ * starting at `first_spatial_dim`, are the frame dimensions.
+ *
+ * The range of frames is determined by the `starting_sample`, `frame_offset`
+ * and `num_frames` arguments.
+ * `starting_sample` is the index of the first source sample in the input TensorList;
+ * all samples before it are skipped.
+ * `frame_offset` is the index of the first frame to take from the starting sample.
+ * `num_frames` is the total number of frames that will be pushed to the output TensorBatch.
+ *
+ * @param batch output TensorBatch
+ * @param t_list input TensorList
+ * @param layout layout of the output TensorBatch
  */
-void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
-                        int64_t start, int64_t count, const TensorLayout &layout);
+void PushFramesToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
+                       int first_spatial_dim, int64_t starting_sample, int64_t frame_offset,
+                       int64_t num_frames, const TensorLayout &layout);
+
 
 class NVCVOpWorkspace {
  public: