From cd080a6205a3d1b0791fa94f3368daea6b8a5cce Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com>
Date: Mon, 30 Sep 2024 16:16:30 +0200
Subject: [PATCH 01/29] Fix conda build for Python 3.9 (#5649)

- adds libprotobuf as a runtime dependency for the DALI conda build,
  as it makes sure that the right version of libabseil is installed

Signed-off-by: Janusz Lisiecki
---
 conda/dali_native_libs/recipe/meta.yaml     | 5 +++++
 conda/dali_python_bindings/recipe/meta.yaml | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/conda/dali_native_libs/recipe/meta.yaml b/conda/dali_native_libs/recipe/meta.yaml
index cd10c5d283..85e0d8e2e2 100644
--- a/conda/dali_native_libs/recipe/meta.yaml
+++ b/conda/dali_native_libs/recipe/meta.yaml
@@ -97,6 +97,11 @@ requirements:
     # Since we link statically, we need to add those dependencies explicitly
     - libwebp-base
     - openjpeg
+    # The statically linked libprotobuf-static depends on libabseil, so add libprotobuf here as
+    # a runtime dependency to install the right version of libabseil (protobuf depends on
+    # libprotobuf-static, and a newer version of libprotobuf-static may be available than the
+    # one protobuf was built with)
+    - libprotobuf =5.27.4
     - cfitsio
     - nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }}

diff --git a/conda/dali_python_bindings/recipe/meta.yaml b/conda/dali_python_bindings/recipe/meta.yaml
index 3e498e8724..9ccd2dca1b 100644
--- a/conda/dali_python_bindings/recipe/meta.yaml
+++ b/conda/dali_python_bindings/recipe/meta.yaml
@@ -85,6 +85,11 @@ requirements:
     - nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }}
   run:
     - python
+    # The statically linked libprotobuf-static depends on libabseil, so add libprotobuf here as
+    # a runtime dependency to install the right version of libabseil (protobuf depends on
+    # libprotobuf-static, and a newer version of libprotobuf-static may be available than the
+    # one protobuf was built with)
+    - libprotobuf =5.27.4
     - future
     - astunparse >=1.6.0
     - gast >=0.3.3

From e9aebf164f96b5109122d5736a073228c04d1808 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Mon, 30 Sep 2024 19:43:30 +0200
Subject: [PATCH 02/29] Fix issues detected by Coverity (2024.09.30) (#5652)

* Fix issues detected by Coverity (2024.09.30)

Signed-off-by: Michal Zientkiewicz
---
 dali/operators/image/remap/cvcuda/matrix_adjust.cu              | 1 +
 dali/operators/image/remap/cvcuda/warp_perspective.cc           | 2 +-
 .../operators/image/resize/experimental/resize_op_impl_cvcuda.h | 2 +-
 dali/pipeline/executor/executor2/exec2.cc                       | 2 +-
 dali/pipeline/executor/executor2/exec2_ops_for_test.cu          | 1 +
 dali/pipeline/executor/executor2/exec_node_task.cc              | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/dali/operators/image/remap/cvcuda/matrix_adjust.cu b/dali/operators/image/remap/cvcuda/matrix_adjust.cu
index 4651c880ce..d2928413a2 100644
--- a/dali/operators/image/remap/cvcuda/matrix_adjust.cu
+++ b/dali/operators/image/remap/cvcuda/matrix_adjust.cu
@@ -55,6 +55,7 @@ void adjustMatrices(nvcv::Tensor &matrices, cudaStream_t stream) {
   int num_blocks = div_ceil(bs, 256);
   int threads_per_block = std::min(bs, 256);
   adjustMatricesKernel2<<<num_blocks, threads_per_block, 0, stream>>>(wrap, bs);
+  CUDA_CALL(cudaGetLastError());
 }
 
 }  // namespace warp_perspective

diff --git a/dali/operators/image/remap/cvcuda/warp_perspective.cc b/dali/operators/image/remap/cvcuda/warp_perspective.cc
index 67a42a9409..1f646e3311 100644
--- a/dali/operators/image/remap/cvcuda/warp_perspective.cc
+++ 
b/dali/operators/image/remap/cvcuda/warp_perspective.cc @@ -231,7 +231,7 @@ class WarpPerspective : public nvcvop::NVCVSequenceOperator { NVCVBorderType border_mode_ = NVCV_BORDER_CONSTANT; NVCVInterpolationType interp_type_ = NVCV_INTERP_NEAREST; std::vector fill_value_arg_{0, 0, 0, 0}; - float4 fill_value_; + float4 fill_value_{}; bool inverse_map_ = false; bool ocv_pixel_ = true; std::optional warp_perspective_; diff --git a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h index 285365fd2b..c9a7cbd324 100644 --- a/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h +++ b/dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h @@ -205,7 +205,7 @@ class ResizeOpImplCvCuda : public ResizeBase::Impl { TensorListShape in_shape_, out_shape_; std::vector frame_idx_; // map of absolute frame indices in the input TensorList - int total_frames_; // number of non-empty frames + int total_frames_ = 0; // number of non-empty frames std::vector> params_; cvcuda::HQResize resize_op_{}; diff --git a/dali/pipeline/executor/executor2/exec2.cc b/dali/pipeline/executor/executor2/exec2.cc index c3fe5df5ee..bb989a2163 100644 --- a/dali/pipeline/executor/executor2/exec2.cc +++ b/dali/pipeline/executor/executor2/exec2.cc @@ -347,7 +347,7 @@ class Executor2::Impl { auto stream_idx = assignment[&node]; node.env.order = stream_idx.has_value() - ? AccessOrder(streams_[*stream_idx]) + ? AccessOrder(streams_[*stream_idx].get()) : AccessOrder::host(); } } diff --git a/dali/pipeline/executor/executor2/exec2_ops_for_test.cu b/dali/pipeline/executor/executor2/exec2_ops_for_test.cu index e91fb186c0..65a327f753 100644 --- a/dali/pipeline/executor/executor2/exec2_ops_for_test.cu +++ b/dali/pipeline/executor/executor2/exec2_ops_for_test.cu @@ -99,6 +99,7 @@ void DummyOpGPU::RunImpl(Workspace &ws) { scratch.ToGPU(ws.stream(), pointers), ws.NumInput() + 1, N); + CUDA_CALL(cudaGetLastError()); } diff --git a/dali/pipeline/executor/executor2/exec_node_task.cc b/dali/pipeline/executor/executor2/exec_node_task.cc index 36360ce47b..99ef627bce 100644 --- a/dali/pipeline/executor/executor2/exec_node_task.cc +++ b/dali/pipeline/executor/executor2/exec_node_task.cc @@ -259,6 +259,7 @@ void OpTask::RunOp() { ResetInputLayouts(); PropagateSourceInfo(*ws_); } + assert(ws_->GetIterationData()); if (auto cpt = ws_->GetIterationData()->checkpoint) { node_->op->SaveState(cpt->GetOpCheckpoint(node_->instance_name), ws_->output_order()); } From 93ce1648eb2efe30e237a8589e4c66224f5d82c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Tue, 1 Oct 2024 11:01:29 +0200 Subject: [PATCH 03/29] Add DataNode.shape() (#5648) * Add DataNode.shape() * Add an early check for non-dynamic executor in DataNode.cpu() and shape(). 
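As a sketch of what this patch enables (the file_root path below is hypothetical; any pipeline created with experimental_exec_dynamic=True works the same way), the new DataNode.shape() accessor and the checked GPU-to-CPU transition can be used as follows:

    from nvidia.dali import pipeline_def, fn

    @pipeline_def(batch_size=8, num_threads=4, device_id=0,
                  experimental_exec_dynamic=True)
    def shapes_pipe():
        enc, _ = fn.readers.file(file_root="/path/to/images")  # hypothetical path
        img = fn.decoders.image(enc, device="mixed")
        # img.shape() returns the run-time shape of each sample as a new DataNode;
        # with the dynamic executor, GPU data can also be brought back via .cpu()
        return img.shape(), img.cpu().shape()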
---------

Signed-off-by: Michal Zientkiewicz
---
 dali/python/nvidia/dali/data_node.py | 27 +++++++++++++++++++++++++++
 dali/test/python/test_pipeline.py    | 20 +++++++++++++++-----
 2 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/dali/python/nvidia/dali/data_node.py b/dali/python/nvidia/dali/data_node.py
index ebd8312dbb..3892a1ce5c 100644
--- a/dali/python/nvidia/dali/data_node.py
+++ b/dali/python/nvidia/dali/data_node.py
@@ -81,6 +81,7 @@ def gpu(self) -> DataNode:
         return self._to_backend("gpu")
 
     def cpu(self) -> DataNode:
+        self._check_gpu2cpu()
         return self._to_backend("cpu")
 
     # Note: Regardless of whether we want the cpu or gpu version
@@ -259,6 +260,32 @@ def process_index(idx, dim):
         else:
             return nvidia.dali.fn.expand_dims(sliced, axes=new_axes, new_axis_names=new_axis_names)
 
+    def shape(self, *, dtype=None, device="cpu"):
+        """Returns the run-time shapes of this DataNode as a new DataNode.
+
+        Parameters
+        ----------
+        dtype : DALIDataType, optional
+            If specified, the shape will be converted to this data type; defaults to INT64.
+        device : str, optional
+            The device ("cpu" or "gpu") where the result is returned; defaults to CPU.
+        """
+        from . import fn
+
+        if device == "cpu":
+            self._check_gpu2cpu()
+        return fn.shapes(self, dtype=dtype, device=device)
+
+    def _check_gpu2cpu(self):
+        if self.device == "gpu" and self.source and self.source.pipeline:
+            if not self.source.pipeline._exec_dynamic:
+                raise RuntimeError(
+                    "This pipeline doesn't support transition from GPU to CPU.\n"
+                    'To enable GPU->CPU transitions, use the experimental "dynamic" executor.\n'
+                    "Specify experimental_exec_dynamic=True in your Pipeline constructor or "
+                    "@pipeline_def."
+                )
+
 
 not_iterable(DataNode)

diff --git a/dali/test/python/test_pipeline.py b/dali/test/python/test_pipeline.py
index 0a6456666e..a88473a1d9 100644
--- a/dali/test/python/test_pipeline.py
+++ b/dali/test/python/test_pipeline.py
@@ -2257,18 +2257,24 @@ def pdef():
         enc, _ = fn.readers.file(file_root=jpeg_folder)
         img = fn.decoders.image(enc, device="mixed")
         peek = fn.peek_image_shape(enc)
-        return peek, fn.shapes(img, device="cpu"), fn.shapes(img.cpu())
+        shapes_of_gpu = fn.shapes(img, device="cpu")
+        shapes_of_cpu = fn.shapes(img.cpu())
+        return peek, shapes_of_gpu, shapes_of_cpu, img.shape(), img.cpu().shape()
 
     pipe = pdef()
     pipe.build()
     for i in range(10):
-        peek, shape_of_gpu, shape_of_cpu = pipe.run()
+        peek, shape_of_gpu, shape_of_cpu, shape_func_gpu, shape_func_cpu = pipe.run()
         # all results must be CPU tensor lists
         assert isinstance(peek, dali.backend_impl.TensorListCPU)
         assert isinstance(shape_of_gpu, dali.backend_impl.TensorListCPU)
         assert isinstance(shape_of_cpu, dali.backend_impl.TensorListCPU)
+        assert isinstance(shape_func_gpu, dali.backend_impl.TensorListCPU)
+        assert isinstance(shape_func_cpu, dali.backend_impl.TensorListCPU)
         check_batch(shape_of_gpu, peek, bs, 0, 0)
         check_batch(shape_of_cpu, peek, bs, 0, 0)
+        check_batch(shape_func_gpu, peek, bs, 0, 0)
+        check_batch(shape_func_cpu, peek, bs, 0, 0)
 
 
 def test_gpu2cpu_old_exec_error():
@@ -2282,11 +2288,15 @@ def test_gpu2cpu_old_exec_error():
         exec_pipelined=False,
         experimental_exec_dynamic=False,
     )
-    def pdef():
+    def pdef(to_cpu):
         gpu = fn.external_source("input", device="gpu")
-        return gpu.cpu()
+        return to_cpu(gpu)
+
+    with assert_raises(RuntimeError, glob="doesn't support transition from GPU to CPU"):
+        _ = pdef(lambda gpu: gpu.cpu())  # this will raise an error at construction time
+
+    pipe = pdef(lambda gpu: gpu._to_backend("cpu"))  # this will not raise errors 
until build-time - pipe = pdef() with assert_raises(RuntimeError, glob="doesn't support transition from GPU to CPU"): pipe.build() From 154dad0e83a380c3705213e163f256f853f9023c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Tue, 1 Oct 2024 19:31:27 +0200 Subject: [PATCH 04/29] Remove and forbid direct inclusion of half.hpp. (#5654) Signed-off-by: Michal Zientkiewicz --- .../kernels/slice/slice_flip_normalize_permute_pad_cpu.h | 1 - dali/operators/image/crop/crop.cc | 1 - dali/operators/image/crop/crop_mirror_normalize.cc | 1 - dali/pipeline/data/types.cc | 3 +-- dali/python/backend_impl.cc | 1 - include/dali/core/float16.h | 7 +++++-- include/dali/util/half.hpp | 9 ++++----- 7 files changed, 10 insertions(+), 13 deletions(-) diff --git a/dali/kernels/slice/slice_flip_normalize_permute_pad_cpu.h b/dali/kernels/slice/slice_flip_normalize_permute_pad_cpu.h index 8580a62c0c..12efb34938 100644 --- a/dali/kernels/slice/slice_flip_normalize_permute_pad_cpu.h +++ b/dali/kernels/slice/slice_flip_normalize_permute_pad_cpu.h @@ -26,7 +26,6 @@ #include "dali/kernels/kernel.h" #include "dali/kernels/slice/slice_flip_normalize_permute_pad_common.h" #include "dali/kernels/slice/slice_kernel_utils.h" -#include "dali/util/half.hpp" namespace dali { namespace kernels { diff --git a/dali/operators/image/crop/crop.cc b/dali/operators/image/crop/crop.cc index 97c4c5ca5b..b112300628 100644 --- a/dali/operators/image/crop/crop.cc +++ b/dali/operators/image/crop/crop.cc @@ -18,7 +18,6 @@ #include "dali/core/static_switch.h" #include "dali/kernels/slice/slice_cpu.h" #include "dali/pipeline/data/views.h" -#include "dali/util/half.hpp" namespace dali { diff --git a/dali/operators/image/crop/crop_mirror_normalize.cc b/dali/operators/image/crop/crop_mirror_normalize.cc index 8672bcaf3e..4197c8f0d4 100644 --- a/dali/operators/image/crop/crop_mirror_normalize.cc +++ b/dali/operators/image/crop/crop_mirror_normalize.cc @@ -17,7 +17,6 @@ #include "dali/core/tensor_layout.h" #include "dali/kernels/slice/slice_flip_normalize_permute_pad_cpu.h" #include "dali/pipeline/data/views.h" -#include "dali/util/half.hpp" namespace dali { diff --git a/dali/pipeline/data/types.cc b/dali/pipeline/data/types.cc index 51fd207d2a..079534c205 100644 --- a/dali/pipeline/data/types.cc +++ b/dali/pipeline/data/types.cc @@ -27,8 +27,7 @@ const auto &_type_info_##Id = TypeTable::GetTypeId() #include "dali/pipeline/data/types.h" -#include "dali/util/half.hpp" - +#include "dali/core/float16.h" #include "dali/pipeline/data/backend.h" #include "dali/core/per_stream_pool.h" #include "dali/kernels/common/scatter_gather.h" diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc index c4eabb2d06..b9f43e8004 100644 --- a/dali/python/backend_impl.cc +++ b/dali/python/backend_impl.cc @@ -44,7 +44,6 @@ #include "dali/pipeline/pipeline_debug.h" #include "dali/plugin/plugin_manager.h" #include "dali/python/python3_compat.h" -#include "dali/util/half.hpp" #include "dali/util/pybind.h" #include "dali/util/user_stream.h" diff --git a/include/dali/core/float16.h b/include/dali/core/float16.h index e43cd09ee1..06fd5c4b9b 100644 --- a/include/dali/core/float16.h +++ b/include/dali/core/float16.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,9 +21,12 @@
 #include
 #include "dali/core/host_dev.h"
 #include "dali/core/force_inline.h"
+
 #ifndef __CUDA_ARCH__
+#define DALI_CORE_FLOAT16_H_INTERNAL
 #include "dali/util/half.hpp"
-#endif
+#undef DALI_CORE_FLOAT16_H_INTERNAL
+#endif  // __CUDA_ARCH__
 
 namespace dali {

diff --git a/include/dali/util/half.hpp b/include/dali/util/half.hpp
index fb5db617f2..fb6f505a11 100644
--- a/include/dali/util/half.hpp
+++ b/include/dali/util/half.hpp
@@ -22,6 +22,10 @@
 #ifndef HALF_HALF_HPP
 #define HALF_HALF_HPP
 
+#ifndef DALI_CORE_FLOAT16_H_INTERNAL
+#error This header is not meant for direct inclusion. Include dali/core/float16.h instead.
+#endif  // DALI_CORE_FLOAT16_H_INTERNAL
+
 #include
 #include
 
@@ -200,11 +204,6 @@
 #include
 #endif
 
-#ifdef __CUDA_ARCH__
-  #include "caffe/util/half.cuh"
-  #include "caffe/util/gpu_math_functions.cuh"
-#endif
-
 #if !defined(CPU_ONLY) && defined(__CUDA_ARCH__)
   #define CAFFE_UTIL_HD __host__ __device__
   #define CAFFE_UTIL_IHD __inline__ __host__ __device__

From 0a980760083f4cd96cd232a71be381d9887a9eca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Tue, 1 Oct 2024 20:05:02 +0200
Subject: [PATCH 05/29] GetProperty refactor + DataNode.property accessor
 (#5650)

* Refactor get_property. Add `property` and `source_info` functions to
  DataNode.

* Add documentation to _check_gpu2cpu.

---------

Signed-off-by: Michal Zientkiewicz
---
 dali/operators/util/get_property.cc        | 107 +++++++++++++++++-
 dali/operators/util/get_property.h         |  36 ++----
 dali/operators/util/property.cc            |  64 -----------
 dali/operators/util/property.h             |  99 ----------------
 dali/python/nvidia/dali/data_node.py       |  36 ++++++
 .../python/operator_1/test_get_property.py |  37 ++++--
 6 files changed, 179 insertions(+), 200 deletions(-)
 delete mode 100644 dali/operators/util/property.cc
 delete mode 100644 dali/operators/util/property.h

diff --git a/dali/operators/util/get_property.cc b/dali/operators/util/get_property.cc
index 409f6f86e4..f6abb90cf7 100644
--- a/dali/operators/util/get_property.cc
+++ b/dali/operators/util/get_property.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@ DALI_SCHEMA(GetProperty)
 The type of the output will depend on the ``key`` of the requested property.)code")
     .NumInput(1)
+    .InputDevice(0, InputDevice::Metadata)
     .NumOutput(1)
     .AddArg("key",
       R"code(Specifies which property is requested. 
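Before the schema details below, a sketch of how the refactored operator is meant to be used from Python (the file_root path is hypothetical; property() and source_info() are the DataNode accessors added later in this patch, equivalent to calling fn.get_property directly):

    import nvidia.dali.fn as fn
    from nvidia.dali import pipeline_def

    @pipeline_def(batch_size=4, num_threads=2, device_id=0,
                  experimental_exec_dynamic=True)
    def meta_pipe():
        enc, _ = fn.readers.file(file_root="/path/to/images")  # hypothetical path
        img = fn.decoders.image(enc, device="mixed")
        # equivalent to fn.get_property(img, key="source_info") / key="layout";
        # each sample is returned as a 1D uint8 tensor holding the string
        return img.source_info(), img.property("layout")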
@@ -38,6 +39,110 @@ The following properties are supported: )code", DALI_STRING); +template +void GetPerSample(TensorList &out, const TensorList &in, + SampleShapeFunc &&sample_shape, CopySampleFunc &©_sample) { + int N = in.num_samples(); + TensorListShape<> tls; + for (int i = 0; i < N; i++) { + auto shape = sample_shape(in, i); + if (i == 0) + tls.resize(N, shape.sample_dim()); + tls.set_tensor_shape(i, shape); + } + out.Resize(tls, DALI_UINT8); + for (int i = 0; i < N; i++) { + copy_sample(out, in, i); + } +} + +template +void SourceInfoToTL(TensorList &out, const TensorList &in) { + GetPerSample(out, in, + [](auto &in, int idx) { + auto &info = in.GetMeta(idx).GetSourceInfo(); + return TensorShape<1>(info.length()); + }, + [](auto &out, auto &in, int idx) { + auto &info = in.GetMeta(idx).GetSourceInfo(); + std::memcpy(out.raw_mutable_tensor(idx), info.c_str(), info.length()); + }); +} + +template +void SourceInfoToTL(TensorList &out, const TensorList &in) { + TensorList tmp; + tmp.set_pinned(true); + SourceInfoToTL(tmp, in); + tmp.set_order(out.order()); + out.Copy(tmp); +} + +template +void SourceInfoToTL(TensorList &out, const Workspace &ws) { + ws.Output(0).set_order(ws.output_order()); + if (ws.InputIsType(0)) + return SourceInfoToTL(out, ws.Input(0)); + else if (ws.InputIsType(0)) + return SourceInfoToTL(out, ws.Input(0)); + else + DALI_FAIL("Internal error - input 0 is neither CPU nor GPU."); +} + +template +void RepeatTensor(TensorList &tl, const Tensor &t, int N) { + tl.Reset(); + tl.set_device_id(t.device_id()); + tl.SetSize(N); + tl.set_sample_dim(t.ndim()); + tl.set_type(t.type()); + tl.SetLayout(t.GetLayout()); + for (int i = 0; i < N; i++) + tl.SetSample(i, t); +} + +template +void RepeatFirstSample(TensorList &tl, int N) { + Tensor t; + TensorShape<> shape = tl[0].shape(); + t.ShareData(unsafe_sample_owner(tl, 0), shape.num_elements(), tl.is_pinned(), + shape, tl.type(), tl.device_id(), tl.order()); + t.SetMeta(tl.GetMeta(0)); + RepeatTensor(tl, t, N); +} + +void LayoutToTL(TensorList &out, const Workspace &ws) { + TensorLayout l = ws.GetInputLayout(0); + out.Resize(uniform_list_shape(1, { l.size() }), DALI_UINT8); + memcpy(out.raw_mutable_tensor(0), l.data(), l.size()); + RepeatFirstSample(out, ws.GetInputBatchSize(0)); +} + +void LayoutToTL(TensorList &out, const Workspace &ws) { + TensorLayout l = ws.GetInputLayout(0); + Tensor tmp_cpu; + Tensor tmp_gpu; + tmp_cpu.Resize(TensorShape<1>(l.size()), DALI_UINT8); + memcpy(tmp_cpu.raw_mutable_data(), l.data(), l.size()); + tmp_cpu.set_order(ws.output_order()); + tmp_gpu.set_order(ws.output_order()); + tmp_gpu.Copy(tmp_cpu); + + RepeatTensor(out, tmp_gpu, ws.GetInputBatchSize(0)); +} + +template +auto GetProperty::GetPropertyReader(std::string_view key) -> PropertyReader { + if (key == "source_info") { + return static_cast(SourceInfoToTL); + } else if (key == "layout") { + return static_cast(LayoutToTL); + } else { + DALI_FAIL(make_string("Unsupported property key: ", key)); + } +} + + DALI_REGISTER_OPERATOR(GetProperty, GetProperty, CPU) DALI_REGISTER_OPERATOR(GetProperty, GetProperty, GPU) diff --git a/dali/operators/util/get_property.h b/dali/operators/util/get_property.h index 02ff7c1bd5..59c0a03ef1 100644 --- a/dali/operators/util/get_property.h +++ b/dali/operators/util/get_property.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,8 +17,8 @@ #include #include +#include #include -#include "dali/operators/util/property.h" #include "dali/pipeline/data/type_traits.h" #include "dali/pipeline/operator/common.h" #include "dali/pipeline/operator/checkpointing/stateless_operator.h" @@ -32,41 +32,29 @@ class GetProperty : public StatelessOperator { explicit GetProperty(const OpSpec &spec) : StatelessOperator(spec), property_key_(spec.template GetArgument("key")), - property_(PropertyFactory()) {} - - ~GetProperty() override = default; - DISABLE_COPY_MOVE_ASSIGN(GetProperty); + property_reader_(GetPropertyReader(property_key_)) {} protected: bool CanInferOutputs() const override { - return true; + return false; // we may broadcast a common value to all samples } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { - const auto &input = ws.Input(0); - output_desc.resize(1); - output_desc[0].shape = property_->GetShape(input); - output_desc[0].type = property_->GetType(input); - return true; + return false; } void RunImpl(Workspace &ws) override { - property_->FillOutput(ws); + property_reader_(ws.Output(0), ws); } private: - std::unique_ptr> PropertyFactory() { - if (property_key_ == "source_info") { - return std::make_unique>(); - } else if (property_key_ == "layout") { - return std::make_unique>(); - } else { - DALI_FAIL(make_string("Unknown property key: ", property_key_)); - } - } + using PropertyReaderFunc = void(TensorList &, const Workspace &); + using PropertyReader = std::function; + + std::string property_key_; + PropertyReader property_reader_; - const std::string property_key_; - std::unique_ptr> property_; + static PropertyReader GetPropertyReader(std::string_view key); }; } // namespace dali diff --git a/dali/operators/util/property.cc b/dali/operators/util/property.cc deleted file mode 100644 index eb62213cd9..0000000000 --- a/dali/operators/util/property.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "dali/operators/util/property.h" -#include "dali/pipeline/data/backend.h" - -namespace dali { -namespace tensor_property { - -template <> -void SourceInfo::FillOutput(Workspace &ws) { - const auto& input = ws.Input(0); - auto& output = ws.Output(0); - for (int sample_id = 0; sample_id < input.num_samples(); sample_id++) { - auto si = GetSourceInfo(input, sample_id); - std::memcpy(output.mutable_tensor(sample_id), si.c_str(), si.length()); - } -} - -template <> -void Layout::FillOutput(Workspace &ws) { - const auto& input = ws.Input(0); - auto& output = ws.Output(0); - for (int sample_id = 0; sample_id < input.num_samples(); sample_id++) { - auto layout = GetLayout(input, sample_id); - std::memcpy(output.mutable_tensor(sample_id), layout.c_str(), layout.size()); - } -} - -template <> -void SourceInfo::FillOutput(Workspace &ws) { - const auto& input = ws.Input(0); - auto& output = ws.Output(0); - for (int sample_id = 0; sample_id < input.num_samples(); sample_id++) { - auto si = GetSourceInfo(input, sample_id); - auto output_ptr = output.raw_mutable_tensor(sample_id); - cudaMemcpyAsync(output_ptr, si.c_str(), si.length(), cudaMemcpyDefault, ws.stream()); - } -} - -template <> -void Layout::FillOutput(Workspace &ws) { - const auto& input = ws.Input(0); - auto& output = ws.Output(0); - for (int sample_id = 0; sample_id < input.num_samples(); sample_id++) { - auto layout = GetLayout(input, sample_id); - auto output_ptr = output.raw_mutable_tensor(sample_id); - cudaMemcpyAsync(output_ptr, layout.c_str(), layout.size(), cudaMemcpyDefault, ws.stream()); - } -} - -} // namespace tensor_property -} // namespace dali diff --git a/dali/operators/util/property.h b/dali/operators/util/property.h deleted file mode 100644 index 7c46a7b50c..0000000000 --- a/dali/operators/util/property.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef DALI_OPERATORS_UTIL_PROPERTY_H_ -#define DALI_OPERATORS_UTIL_PROPERTY_H_ - -#include -#include "dali/pipeline/data/type_traits.h" -#include "dali/pipeline/operator/common.h" -#include "dali/pipeline/operator/operator.h" - -namespace dali { -namespace tensor_property { - -/** - * Base class for a property of the Tensor. - * @tparam Backend Backend of the operator. - */ -template -struct Property { - Property() = default; - virtual ~Property() = default; - - /** - * @return The shape of the tensor containing the property, based on the input to the operator. - */ - virtual TensorListShape<> GetShape(const TensorList& input) = 0; - - /** - * @return The type of the tensor containing the property, based on the input to the operator. - */ - virtual DALIDataType GetType(const TensorList& input) = 0; - - /** - * This function implements filling the output of the operator. Its implementation should - * be similar to any RunImpl function of the operator. 
- */ - virtual void FillOutput(Workspace&) = 0; -}; - - -template -struct SourceInfo : public Property { - TensorListShape<> GetShape(const TensorList& input) override { - TensorListShape<> ret{static_cast(input.num_samples()), 1}; - for (int i = 0; i < ret.size(); i++) { - ret.set_tensor_shape(i, {static_cast(GetSourceInfo(input, i).length())}); - } - return ret; - } - - DALIDataType GetType(const TensorList&) override { - return DALI_UINT8; - } - - void FillOutput(Workspace &ws) override; - - private: - const std::string& GetSourceInfo(const TensorList& input, size_t idx) { - return input.GetMeta(idx).GetSourceInfo(); - } -}; - - -template -struct Layout : public Property { - TensorListShape<> GetShape(const TensorList& input) override { - // Every tensor in the output has the same number of dimensions - return uniform_list_shape(input.num_samples(), {GetLayout(input, 0).size()}); - } - - DALIDataType GetType(const TensorList&) override { - return DALI_UINT8; - } - - void FillOutput(Workspace &ws) override; - - private: - const TensorLayout& GetLayout(const TensorList& input, int idx) { - return input.GetMeta(idx).GetLayout(); - } -}; - - -} // namespace tensor_property -} // namespace dali - -#endif // DALI_OPERATORS_UTIL_PROPERTY_H_ diff --git a/dali/python/nvidia/dali/data_node.py b/dali/python/nvidia/dali/data_node.py index 3892a1ce5c..c9dc1a5532 100644 --- a/dali/python/nvidia/dali/data_node.py +++ b/dali/python/nvidia/dali/data_node.py @@ -276,7 +276,43 @@ def shape(self, *, dtype=None, device="cpu"): self._check_gpu2cpu() return fn.shapes(self, dtype=dtype, device=device) + def property(self, key, *, device="cpu"): + """Returns a metadata property associated with a DataNode + + Parameters + ---------- + key : str + The name of the metadata item. Currently supported: + "source_info" - the file name or location in the dataset where the data originated + (each sample is a 1D uint8 tensor) + "layout" - the layout string + (each sample is a 1D uint8 tensor) + device : str, optional + The device, where the value is returned; defaults to CPU. + """ + + from . import fn + + if device == "cpu": + self._check_gpu2cpu() + + return fn.get_property(self, key=key, device=device) + + def source_info(self, *, device="cpu"): + """Returns the "source_info" property. Equivalent to self.meta("source_info").""" + return self.property("source_info", device=device) + def _check_gpu2cpu(self): + """Checks whether using this `DataNode` in a CPU operator is legal. + + The function checks whether it's legal to pass it as an input to a CPU operator. + If the node is a result of a GPU operator which belongs to a pipeline with non-dynamic + executor, an error is raised. + + .. note:: + If the defining operator does not yet belong to any pipeline, the error is not raised and + the check is deferred until `Pipeline.build`. + """ if self.device == "gpu" and self.source and self.source.pipeline: if not self.source.pipeline._exec_dynamic: raise RuntimeError( diff --git a/dali/test/python/operator_1/test_get_property.py b/dali/test/python/operator_1/test_get_property.py index 2be70a8379..da718300ca 100644 --- a/dali/test/python/operator_1/test_get_property.py +++ b/dali/test/python/operator_1/test_get_property.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,8 @@ from nvidia.dali import pipeline_def import os import numpy as np -from nvidia.dali import fn +import nvidia.dali as dali +import nvidia.dali.fn as fn from test_utils import get_dali_extra_path from nose_utils import raises import tempfile @@ -52,12 +53,12 @@ def test_file_properties(): yield _test_file_properties, dev -@pipeline_def -def wds_properties(root_path, device, idx_paths): +@pipeline_def(experimental_exec_dynamic=True) +def wds_source_info(root_path, device, idx_paths): read = fn.readers.webdataset(paths=[root_path], index_paths=idx_paths, ext=["jpg"]) if device == "gpu": read = read.gpu() - return fn.get_property(read, key="source_info") + return read.source_info() def generate_wds_index(root_path, index_path): @@ -67,7 +68,7 @@ def generate_wds_index(root_path, index_path): ic.create_index() -def _test_wds_properties(device, generate_index): +def _test_wds_source_info(device, generate_index): root_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar") ref_filenames = [ "2000.jpg", @@ -84,25 +85,24 @@ def _test_wds_properties(device, generate_index): with tempfile.TemporaryDirectory() as idx_dir: index_paths = [os.path.join(idx_dir, os.path.basename(root_path) + ".idx")] generate_wds_index(root_path, index_paths[0]) - p = wds_properties( + p = wds_source_info( root_path, device, index_paths, batch_size=8, num_threads=4, device_id=0 ) p.build() output = p.run() else: - p = wds_properties(root_path, device, None, batch_size=8, num_threads=4, device_id=0) + p = wds_source_info(root_path, device, None, batch_size=8, num_threads=4, device_id=0) p.build() output = p.run() for out in output: - out = out if device == "cpu" else out.as_cpu() for source_info, ref_fname, ref_idx in zip(out, ref_filenames, ref_indices): assert _uint8_tensor_to_string(source_info) == f"{root_path}:{ref_idx}:{ref_fname}" -def test_wds_properties(): +def test_wds_source_info(): for dev in ["cpu", "gpu"]: for gen_idx in [True, False]: - yield _test_wds_properties, dev, gen_idx + yield _test_wds_source_info, dev, gen_idx @pipeline_def @@ -180,7 +180,7 @@ def improper_property(root_path, device): return fn.get_property(read, key=["this key doesn't exist"]) -@raises(RuntimeError, glob="Unknown property key*") +@raises(RuntimeError, glob="Unsupported property key*") def _test_improper_property(device): root_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar") p = improper_property(root_path, device, batch_size=8, num_threads=4, device_id=0) @@ -191,3 +191,16 @@ def _test_improper_property(device): def test_improper_property(): for dev in ["cpu", "gpu"]: yield _test_improper_property, dev + + +def test_get_property_gpu2cpu(): + @pipeline_def(batch_size=2, device_id=0, num_threads=1, experimental_exec_dynamic=True) + def test_pipe(): + data = dali.types.Constant(np.array([[[42]]]), device="gpu", layout="abc") + return fn.get_property(data, key="layout", device="cpu") + + pipe = test_pipe() + pipe.build() + (out,) = pipe.run() + assert _uint8_tensor_to_string(out[0]) == "abc" + assert _uint8_tensor_to_string(out[1]) == "abc" From 0e54fe1adf9de45d260cefc177c729eb61e94b3c Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:58:12 +0200 Subject: [PATCH 06/29] Increase number of the decoder bench iterations (#5655) - increases the number of decoder bench 
iterations to make the test more stable Signed-off-by: Janusz Lisiecki --- qa/TL1_decoder_perf/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/TL1_decoder_perf/test.sh b/qa/TL1_decoder_perf/test.sh index 4e5dcada62..969dbf3a4e 100644 --- a/qa/TL1_decoder_perf/test.sh +++ b/qa/TL1_decoder_perf/test.sh @@ -14,11 +14,11 @@ test_body() { if [ "$(uname -p)" == "x86_64" ]; then # Hopper MIN_PERF=19000; - python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 10 -t 10000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG} + python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG} else # GraceHopper MIN_PERF=29000; - python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 10 -t 10000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG} + python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG} fi PERF=$(grep "fps" ${LOG} | awk '{print $1}') From 2d9d526fa2909f0758336f39a48bae07e9bb2159 Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:27:09 +0200 Subject: [PATCH 07/29] Move to CUDA 12.6 update 2 (#5657) Signed-off-by: Janusz Lisiecki --- docker/Dockerfile.cuda126.aarch64.deps | 2 +- docker/Dockerfile.cuda126.x86_64.deps | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.cuda126.aarch64.deps b/docker/Dockerfile.cuda126.aarch64.deps index 22d77ad04d..90b2be9ab3 100644 --- a/docker/Dockerfile.cuda126.aarch64.deps +++ b/docker/Dockerfile.cuda126.aarch64.deps @@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y libxml2 curl perl gcc && \ rm -rf /var/lib/apt/lists/* -RUN curl -LO https://developer.download.nvidia.com/compute/cuda/12.6.1/local_installers/cuda_12.6.1_560.35.03_linux_sbsa.run && \ +RUN curl -LO https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux_sbsa.run && \ chmod +x cuda_*.run && \ ./cuda_*.run --silent --no-opengl-libs --toolkit && \ rm -f cuda_*.run; diff --git a/docker/Dockerfile.cuda126.x86_64.deps b/docker/Dockerfile.cuda126.x86_64.deps index ffc4913a26..fe39877268 100644 --- a/docker/Dockerfile.cuda126.x86_64.deps +++ b/docker/Dockerfile.cuda126.x86_64.deps @@ -6,7 +6,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y libxml2 curl perl gcc && \ rm -rf /var/lib/apt/lists/* -RUN curl -LO https://developer.download.nvidia.com/compute/cuda/12.6.1/local_installers/cuda_12.6.1_560.35.03_linux.run && \ +RUN curl -LO https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run && \ chmod +x cuda_*.run && \ ./cuda_*.run --silent --no-opengl-libs --toolkit && \ rm -f cuda_*.run; @@ -30,7 +30,7 @@ RUN NVJPEG2K_VERSION=0.8.0.38-1 && \ rm -rf /var/lib/apt/lists/* && \ mkdir nvcomp && \ cd nvcomp && \ - wget https://developer.download.nvidia.com/compute/nvcomp/${NVCOMP_VERSION}/local_installers/nvcomp_${NVCOMP_VERSION}_x86_64_${CUDA_VERSION_MAJOR}.x.tgz && \ + wget https://developer.download.nvidia.com/compute/nvcomp/${NVCOMP_VERSION}/local_installers/nvcomp_${NVCOMP_VERSION}_x86_64_${CUDA_VERSION_MAJOR}.x.tgz && \ tar -xvf nvcomp*.tgz && \ cp -rv include/nvcomp* 
/usr/local/cuda/include/ && \ cp -v lib/*.so /usr/local/cuda/lib64/ && \ From a1fceb8cb2ce3c0360736e6ba4d4508abc5914c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Mon, 7 Oct 2024 09:48:07 +0200 Subject: [PATCH 08/29] Bump required NumPy version to 1.23. (#5658) Bump required NumPy version to 1.23 to get DLPack support. Signed-off-by: Michal Zientkiewicz --- qa/setup_packages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/setup_packages.py b/qa/setup_packages.py index a30f6d98a1..fc81e72064 100755 --- a/qa/setup_packages.py +++ b/qa/setup_packages.py @@ -479,8 +479,8 @@ def get_pyvers_name(self, url, cuda_version): PlainPackage( "numpy", [ - PckgVer(">=1.17,<1.24", python_min_ver="3.8", python_max_ver="3.11"), - PckgVer(">=1.17,<2", python_min_ver="3.12", python_max_ver="3.12"), + PckgVer(">=1.23,<1.24", python_min_ver="3.8", python_max_ver="3.11"), + PckgVer(">=1.23,<2", python_min_ver="3.12", python_max_ver="3.12"), ], ), PlainPackage("opencv-python", [PckgVer("4.8.1.78", dependencies=["numpy<2"])]), From f81c2e13e151697e6d1648667bef368185fb70cc Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:32:17 +0200 Subject: [PATCH 09/29] Update links in DALI readme (#5660) * Update links in DALI readme - updates link to the most recent roadmap GitHub issue - adds GTC 2024 eBay talk in additional materials and success stories Signed-off-by: Janusz Lisiecki --- README.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index d0125005f5..6d9b33ee0e 100644 --- a/README.rst +++ b/README.rst @@ -114,18 +114,19 @@ DALI success stories: --------------------- - `During Kaggle computer vision competitions `__: - `"*DALI is one of the best things I have learned in this competition*" `__ + `"DALI is one of the best things I have learned in this competition" `__ - `Lightning Pose - state of the art pose estimation research model `__ - `To improve the resource utilization in Advanced Computing Infrastructure `__ - `MLPerf - the industry standard for benchmarking compute and deep learning hardware and software `__ +- `"we optimized major models inside eBay with the DALI framework" `__ ---- DALI Roadmap ------------ -`The following issue represents `__ a high-level overview of our 2023 plan. You should be aware that this -roadmap may change at any time and the order below does not reflect any type of priority. +`The following issue represents `__ a high-level overview of our 2024 plan. You should be aware that this +roadmap may change at any time and the order of its items does not reflect any type of priority. We strongly encourage you to comment on our roadmap and provide us feedback on the mentioned GitHub issue. @@ -177,6 +178,8 @@ depending on your version. Additional Resources -------------------- +- GPU Technology Conference 2024; **Optimizing Inference Model Serving for Highest Performance at eBay**; Yiheng Wang: + `event `__ - GPU Technology Conference 2023; **Developer Breakout: Accelerating Enterprise Workflows With Triton Server and DALI**; Brandon Tuttle: `event `__. - GPU Technology Conference 2023; **GPU-Accelerating End-to-End Geospatial Workflows**; Kevin Green: From 988265ad0af0d318b159a4a6721874bbca78048f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Tue, 8 Oct 2024 11:20:49 +0200 Subject: [PATCH 10/29] Fix constness in (Const)SampleView. Improve diagnostics. 
(#5664) * Fix constness in (Const)SampleView. * Improve diagnostics by marking disallowed functions as deleted. Signed-off-by: Michal Zientkiewicz --- dali/pipeline/data/sample_view.h | 13 ++++++++++--- dali/pipeline/data/sample_view_test.cc | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dali/pipeline/data/sample_view.h b/dali/pipeline/data/sample_view.h index f8536b908f..8e170fe0cf 100644 --- a/dali/pipeline/data/sample_view.h +++ b/dali/pipeline/data/sample_view.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -36,6 +36,8 @@ namespace dali { template class SampleViewBase { public: + template + static constexpr bool is_mutable = !std::is_const_v>; /** * @name Get the underlying pointer to data * @{ @@ -44,7 +46,7 @@ class SampleViewBase { * @brief Return an un-typed pointer to the underlying storage. */ template - std::enable_if_t::value, void *> raw_mutable_data() { + std::enable_if_t, void *> raw_mutable_data() const { return data_; } @@ -60,7 +62,7 @@ class SampleViewBase { * The calling type must match the underlying type of the buffer. */ template - inline std::enable_if_t::value, T *> mutable_data() { + inline std::enable_if_t, T *> mutable_data() const { DALI_ENFORCE( type() == TypeTable::GetTypeId(), make_string( @@ -167,6 +169,11 @@ class ConstSampleView : public SampleViewBase { using Base = SampleViewBase; using Base::Base; + template + T *mutable_data() const = delete; + + void *raw_mutable_data() const = delete; + ConstSampleView(const SampleView &other) // NOLINT : Base(other.raw_data(), other.shape(), other.type()) {} diff --git a/dali/pipeline/data/sample_view_test.cc b/dali/pipeline/data/sample_view_test.cc index 50f3c37637..513369e95e 100644 --- a/dali/pipeline/data/sample_view_test.cc +++ b/dali/pipeline/data/sample_view_test.cc @@ -46,6 +46,9 @@ TEST(SampleView, Constructors) { SampleView default_view{}; compare(default_view, nullptr, {0}, DALI_NO_TYPE); + default_view = {}; + compare(default_view, nullptr, {0}, DALI_NO_TYPE); + int32_t data{}; SampleView from_ptr{&data, {1, 2, 3}}; compare(from_ptr, &data, {1, 2, 3}, DALI_INT32); From f8a76a68aac7c90a2e07e2721febe26c612d06aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Thu, 10 Oct 2024 07:46:02 +0200 Subject: [PATCH 11/29] Fix Pipeline reference leak in PythonFunction. (#5668) * Use a stub pipeline as PythonFunction's "current pipeline" to avoid pipeline self-referencing and self-deleting from within its ThreadPool. --------- Signed-off-by: Michal Zientkiewicz --- .../python_function/python_function.cc | 10 +++-- .../dali/ops/_operators/python_function.py | 5 ++- dali/python/nvidia/dali/pipeline.py | 28 +++++++++++++ .../dali/plugin/pytorch/_torch_function.py | 2 +- .../python/operator_2/test_python_function.py | 39 +++++++------------ 5 files changed, 53 insertions(+), 31 deletions(-) diff --git a/dali/operators/python_function/python_function.cc b/dali/operators/python_function/python_function.cc index f392d077d1..255bd32d18 100644 --- a/dali/operators/python_function/python_function.cc +++ b/dali/operators/python_function/python_function.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,7 +20,11 @@ namespace dali { DALI_SCHEMA(PythonFunctionBase) .AddArg("function", - "Function object.", + R"code(A callable object that defines the function of the operator. + +.. warning:: + The function must not hold a reference to the pipeline in which it is used. If it does, + a circular reference to the pipeline will form and the pipeline will never be freed.)code", DALI_PYTHON_OBJECT) .AddOptionalArg("num_outputs", R"code(Number of outputs.)code", 1) .AddOptionalArg>("output_layouts", @@ -41,7 +45,7 @@ a more universal data format, see :meth:`nvidia.dali.fn.dl_tensor_python_functio The function should not modify input tensors. .. warning:: - This operator is not compatible with TensorFlow integration. + This operator is not compatible with TensorFlow integration. .. warning:: When the pipeline has conditional execution enabled, additional steps must be taken to diff --git a/dali/python/nvidia/dali/ops/_operators/python_function.py b/dali/python/nvidia/dali/ops/_operators/python_function.py index 8d386c9ff6..9cca3cfcea 100644 --- a/dali/python/nvidia/dali/ops/_operators/python_function.py +++ b/dali/python/nvidia/dali/ops/_operators/python_function.py @@ -51,9 +51,10 @@ def __init__(self, function, num_outputs=1, **kwargs): def __call__(self, *inputs, **kwargs): inputs = ops._preprocess_inputs(inputs, impl_name, self._device, None) - self.pipeline = _Pipeline.current() - if self.pipeline is None: + curr_pipe = _Pipeline.current() + if curr_pipe is None: _Pipeline._raise_pipeline_required("PythonFunction operator") + self.pipeline = curr_pipe._stub() for inp in inputs: if not isinstance(inp, _DataNode): diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py index 69d6aa7dcb..44137cd78d 100644 --- a/dali/python/nvidia/dali/pipeline.py +++ b/dali/python/nvidia/dali/pipeline.py @@ -26,6 +26,7 @@ from threading import local as tls from . import data_node as _data_node import atexit +import copy import ctypes import functools import inspect @@ -1764,6 +1765,33 @@ def _generate_build_args(self): for (name, dev), dtype, ndim in zip(self._names_and_devices, dtypes, ndims) ] + def _stub(self): + """Produce a stub by shallow-copying the pipeline, removing the backend and forbidding + operations that require the backend. + + Stub pipelines are necessary in contexts where passing the actual pipeline would cause + circular reference - notably, PythonFunction operator. 
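To illustrate the failure mode this patch addresses, a minimal sketch (my_pipe and the lambda are illustrative; exec_async/exec_pipelined are disabled, as python_function requires):

    import nvidia.dali.fn as fn
    import nvidia.dali.types as types
    from nvidia.dali import pipeline_def

    @pipeline_def(batch_size=1, num_threads=1, device_id=None,
                  exec_async=False, exec_pipelined=False)
    def my_pipe():
        # The function passed below must not capture the Pipeline object itself:
        # a captured reference would form a cycle (pipeline -> operator ->
        # function -> pipeline) and the pipeline would never be freed.
        return fn.python_function(types.Constant(0), function=lambda x: x + 1)

    pipe = my_pipe()
    pipe.build()
    pipe.run()
    del pipe  # with the stub in place, dropping the last reference frees the pipeline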
+        """
+        stub = copy.copy(self)
+        stub._pipe = None
+
+        def short_circuit(self, *args, **kwargs):
+            raise RuntimeError("This method is forbidden in current context")
+
+        stub.start_py_workers = short_circuit
+        stub.build = short_circuit
+        stub.run = short_circuit
+        stub.schedule_run = short_circuit
+        stub.outputs = short_circuit
+        stub.share_outputs = short_circuit
+        stub.release_outputs = short_circuit
+        stub.add_sink = short_circuit
+        stub.checkpoint = short_circuit
+        stub.set_outputs = short_circuit
+        stub.executor_statistics = short_circuit
+        stub.external_source_shm_statistics = short_circuit
+        return stub
+
 
 def _shutdown_pipelines():
     for weak in list(Pipeline._pipes):

diff --git a/dali/python/nvidia/dali/plugin/pytorch/_torch_function.py b/dali/python/nvidia/dali/plugin/pytorch/_torch_function.py
index 05a2df95f8..922bb29139 100644
--- a/dali/python/nvidia/dali/plugin/pytorch/_torch_function.py
+++ b/dali/python/nvidia/dali/plugin/pytorch/_torch_function.py
@@ -56,7 +56,8 @@ def torch_wrapper(self, batch_processing, function, device, *args):
         )
 
     def __call__(self, *inputs, **kwargs):
         pipeline = Pipeline.current()
         if pipeline is None:
             Pipeline._raise_pipeline_required("TorchPythonFunction")
+        pipeline = pipeline._stub()
         if self.stream is None:

diff --git a/dali/test/python/operator_2/test_python_function.py b/dali/test/python/operator_2/test_python_function.py
index 4fbb14788f..ae8214c49c 100644
--- a/dali/test/python/operator_2/test_python_function.py
+++ b/dali/test/python/operator_2/test_python_function.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -649,30 +649,6 @@ def py_fun_pipeline():
     pipe.run()
 
 
-def verify_pipeline(pipeline, input):
-    assert pipeline is Pipeline.current()
-    return input
-
-
-def test_current_pipeline():
-    pipe1 = Pipeline(13, 4, 0)
-    with pipe1:
-        dummy = types.Constant(numpy.ones((1)))
-        output = fn.python_function(dummy, function=lambda inp: verify_pipeline(pipe1, inp))
-        pipe1.set_outputs(output)
-
-    pipe2 = Pipeline(6, 2, 0)
-    with pipe2:
-        dummy = types.Constant(numpy.ones((1)))
-        output = fn.python_function(dummy, function=lambda inp: verify_pipeline(pipe2, inp))
-        pipe2.set_outputs(output)
-
-    pipe1.build()
-    pipe2.build()
-    pipe1.run()
-    pipe2.run()
-
-
 @params(
     numpy.bool_,
     numpy.int_,
@@ -716,3 +692,16 @@ def test_pipe():
 
     pipe.build()
     _ = pipe.run()
+
+
+def test_delete_pipe_while_function_running():
+    def func(x):
+        time.sleep(0.02)
+        return x
+
+    for i in range(5):
+        with Pipeline(batch_size=1, num_threads=1, device_id=None) as pipe:
+            pipe.set_outputs(fn.python_function(types.Constant(0), function=func))
+            pipe.build()
+            pipe.run()
+        del pipe

From 7a51e09796aa27d26250a8acbd3cc445148778f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Thu, 10 Oct 2024 07:47:34 +0200
Subject: [PATCH 12/29] DLPack support rework (#5661)

Refactor DLTensorResource
- Remove inheritance.
- Make more stuff inline.
- Make function names consistent.
- Add DLPack resources which share tensor ownership.
- Add reference counting tests.
- Add CPU pinned memory support. 
- Remove duplicated from JAX plugin --------- Signed-off-by: Michal Zientkiewicz --- .../python_function/dltensor_function.cc | 41 ++- .../python_function/dltensor_function.h | 6 +- dali/operators/python_function/jax_function.h | 56 +--- dali/pipeline/data/dltensor.cc | 46 +-- dali/pipeline/data/dltensor.h | 303 ++++++++++++++++-- dali/pipeline/data/dltensor_test.cc | 265 ++++++++++----- dali/pipeline/util/copy_with_stride_test.cc | 21 +- dali/python/backend_impl.cc | 6 +- dali/util/pybind.h | 6 +- 9 files changed, 508 insertions(+), 242 deletions(-) diff --git a/dali/operators/python_function/dltensor_function.cc b/dali/operators/python_function/dltensor_function.cc index c97edf7c25..6722103864 100644 --- a/dali/operators/python_function/dltensor_function.cc +++ b/dali/operators/python_function/dltensor_function.cc @@ -81,7 +81,7 @@ py::list PrepareDLTensorInputs(Workspace &ws) { py::list dl_tensor_list; auto &input = ws.UnsafeMutableInput(idx); for (Index i = 0; i < ws.GetInputBatchSize(idx); ++i) { - auto dl_capsule = TensorToDLPackView(input[i], input.device_id()); + auto dl_capsule = TensorToDLPackView(input[i], input.is_pinned(), input.device_id()); dl_tensor_list.append(dl_capsule); } input_tuple.append(dl_tensor_list); @@ -109,7 +109,7 @@ py::list PrepareDLTensorInputsPerSample(Workspace &ws) { py::list tuple; for (Index idx = 0; idx < ws.NumInput(); ++idx) { auto &input = ws.UnsafeMutableInput(idx); - auto dl_capsule = TensorToDLPackView(input[s], input.device_id()); + auto dl_capsule = TensorToDLPackView(input[s], input.is_pinned(), input.device_id()); tuple.append(dl_capsule); } input_tuples.append(tuple); @@ -193,29 +193,42 @@ struct PyBindInitializer { // so this workaround initializes them manually static PyBindInitializer pybind_initializer{}; // NOLINT -struct DLTensorNumpyResource: public DLTensorResource { - explicit DLTensorNumpyResource(const py::array &array) - : DLTensorResource(TensorShape<>(array.shape(), array.shape() + array.ndim())) - , array(array) { - strides.resize(array.ndim()); +struct PyArrayPayload : TensorViewPayload { + explicit PyArrayPayload(const py::array &array) : array(array) { + shape = TensorShape<>(array.shape(), array.shape() + array.ndim()); + strides.resize(shape.size()); auto itemsize = array.dtype().itemsize(); for (int i = 0; i < array.ndim(); ++i) { strides[i] = array.strides(i) / itemsize; } } - py::array array; - - ~DLTensorNumpyResource() override = default; }; + +using DLTensorNumpyResource = DLTensorResource; + +auto GetDLTensorResource(const py::array &array) { + auto rsrc = DLTensorNumpyResource::Create(array); + auto &tensor = rsrc->dlm_tensor.dl_tensor; + auto buffer = rsrc->payload.array.request(); + tensor.data = buffer.ptr; + tensor.shape = rsrc->payload.shape.data(); + tensor.ndim = rsrc->payload.shape.size(); + tensor.strides = rsrc->payload.strides.empty() ? 
nullptr : rsrc->payload.strides.data(); + tensor.device = ToDLDevice(false, false, 0); + tensor.dtype = ToDLType(TypeFromFormatStr(buffer.format).id()); + return rsrc; +} + + PYBIND11_MODULE(python_function_plugin, m) { m.def("current_dali_stream", []() { return reinterpret_cast(GetCurrentStream()); }); m.def("DLTensorToArray", [](py::capsule dl_capsule) { auto dlm_tensor_ptr = DLMTensorPtrFromCapsule(dl_capsule); const auto &dl_tensor = dlm_tensor_ptr->dl_tensor; - auto dali_type = DLToDALIType(dl_tensor.dtype); + auto dali_type = ToDALIType(dl_tensor.dtype); py::dtype dtype(FormatStrFromType(dali_type)); auto shape = make_span(dl_tensor.shape, dl_tensor.ndim); py::array array; @@ -233,10 +246,8 @@ PYBIND11_MODULE(python_function_plugin, m) { }); m.def("ArrayToDLTensor", [](py::array array) { - auto buffer = array.request(); - auto dlm_tensor_ptr = MakeDLTensor(buffer.ptr, TypeFromFormatStr(buffer.format).id(), - false, 0, std::make_unique(array)); - return DLTensorToCapsule(std::move(dlm_tensor_ptr)); + auto rsrc = GetDLTensorResource(array); + return DLTensorToCapsule(ToDLMTensor(std::move(rsrc))); }); // For the _a suffix diff --git a/dali/operators/python_function/dltensor_function.h b/dali/operators/python_function/dltensor_function.h index 909bb3bbf3..71a3a9bd32 100644 --- a/dali/operators/python_function/dltensor_function.h +++ b/dali/operators/python_function/dltensor_function.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -65,7 +65,7 @@ std::vector CastToDLTensorList(py::list &list, Index exp_size, Ind result.push_back(DLMTensorPtrFromCapsule(caps)); DALI_ENFORCE(result[i]->dl_tensor.device.device_type == Backend2DLDevice(), "Wrong output backend."); - DALI_ENFORCE(DLToDALIType(result[i]->dl_tensor.dtype) == DLToDALIType(dtype), + DALI_ENFORCE(ToDALIType(result[i]->dl_tensor.dtype) == ToDALIType(dtype), "Output DLPack tensor list should have consistent data type."); DALI_ENFORCE(result[i]->dl_tensor.ndim == ndim, "All samples in the batch should have the same number of dimensions."); @@ -94,7 +94,7 @@ void PrepareOutputs(Workspace &ws, const py::object &output_o, int batch_size) { auto dl_tensors = CastToDLTensorList(dl_list, batch_size, idx); if (dl_tensors.empty()) continue; auto &tlist = ws.Output(idx); - tlist.Resize(GetDLTensorListShape(dl_tensors), DLToDALIType(dl_tensors[0]->dl_tensor.dtype)); + tlist.Resize(GetDLTensorListShape(dl_tensors), ToDALIType(dl_tensors[0]->dl_tensor.dtype)); CopyOutputData(tlist, dl_tensors, ws); } } diff --git a/dali/operators/python_function/jax_function.h b/dali/operators/python_function/jax_function.h index bf23c04ed8..1ae33239d5 100644 --- a/dali/operators/python_function/jax_function.h +++ b/dali/operators/python_function/jax_function.h @@ -31,52 +31,6 @@ namespace dali { namespace detail { -template -struct DLDaliTensorResource { - explicit DLDaliTensorResource(Tensor &&tensor) - : tensor(std::move(tensor)), dlm_tensor{} {} - Tensor tensor; - DLManagedTensor dlm_tensor; -}; - - -template -DLMTensorPtr SetupTensorResource(std::unique_ptr> tensor_resource) { - static_assert(std::is_same_v || std::is_same_v); - Tensor &tensor = tensor_resource->tensor; - DLManagedTensor &dlm_tensor = tensor_resource->dlm_tensor; - - // copy relevant meta-data from the tensor to dl pack 
struct - dlm_tensor.dl_tensor.dtype = GetDLType(tensor.type()); - dlm_tensor.dl_tensor.data = tensor.raw_mutable_data(); - TensorShape<> &tensor_shape = const_cast &>(tensor.shape()); - dlm_tensor.dl_tensor.ndim = tensor_shape.size(); - dlm_tensor.dl_tensor.shape = tensor_shape.begin(); - if (std::is_same_v) { - dlm_tensor.dl_tensor.device = {kDLCUDA, tensor.device_id()}; - } else { - dlm_tensor.dl_tensor.device = {kDLCPU, 0}; - } - - // transfer ownership of both tensor and dl pack structure to dl pack structure - // and re-expose it as a unique_ptr to just dlpack structure - DLDaliTensorResource *raw_ptr = tensor_resource.release(); - dlm_tensor.manager_ctx = raw_ptr; - dlm_tensor.deleter = [](DLManagedTensor *dlm_tensor) { - delete static_cast *>(dlm_tensor->manager_ctx); - }; - return {&raw_ptr->dlm_tensor, - [](DLManagedTensor *dlm_tensor) { dlm_tensor->deleter(dlm_tensor); }}; -} - -template -DLMTensorPtr AsDLTensor(Tensor &&tensor) { - static_assert(std::is_same_v || std::is_same_v); - // 1. allocate DLDaliTensorResource that will hold dlpack struct and the actual tensor together - // 2. copy relevant meta-data from the tensor to managed dl pack struct - return SetupTensorResource(std::make_unique>(std::move(tensor))); -} - /** * @brief Exposes vector of tensors as a Python list of DLTensorObjs. * @@ -90,12 +44,12 @@ DLMTensorPtr AsDLTensor(Tensor &&tensor) { * @return py::list of DlTensorObjs. */ template -py::list TensorsAsDLTensorObjs(std::vector> &&tensors, +py::list TensorsAsDLTensorObjs(std::vector> &tensors, std::optional producer_stream) { py::list dl_tensor_objs; for (size_t i = 0; i < tensors.size(); ++i) { dl_tensor_objs.append( - py::cast(DLTensorObj{AsDLTensor(std::move(tensors[i])), producer_stream})); + py::cast(DLTensorObj{GetSharedDLTensor(tensors[i]), producer_stream})); } return dl_tensor_objs; } @@ -243,14 +197,14 @@ class JaxFunction : public StatelessOperator { std::vector> &&batched_inputs) { py::gil_scoped_acquire interpreter_guard{}; if constexpr (std::is_same_v) { - auto dl_inputs = detail::TensorsAsDLTensorObjs(std::move(batched_inputs), std::nullopt); + auto dl_inputs = detail::TensorsAsDLTensorObjs(batched_inputs, std::nullopt); auto dl_outputs = python_function_(*dl_inputs); return ConsumePythonOutputs(ws, std::move(dl_outputs)); } else if constexpr (!std::is_same_v) { // NOLINT static_assert(std::is_same_v, "The operator supports only CPU and GPU backends"); cudaStream_t stream = ws.stream(); - auto dl_inputs = detail::TensorsAsDLTensorObjs(std::move(batched_inputs), stream); + auto dl_inputs = detail::TensorsAsDLTensorObjs(batched_inputs, stream); auto dl_outputs = python_function_(reinterpret_cast(stream), *dl_inputs); return ConsumePythonOutputs(ws, std::move(dl_outputs)); } @@ -313,7 +267,7 @@ class JaxFunction : public StatelessOperator { TensorListShape<> batch_shape = uniform_list_shape(batch_size, dl_batch_shape.last(dl_batch_shape.size() - 1)); - auto dtype = DLToDALIType(dl_batch.dtype); + auto dtype = ToDALIType(dl_batch.dtype); auto type_info = dali::TypeTable::GetTypeInfo(dtype); size_t size_of_dtype = type_info.size(); int64_t bytes = dl_batch_shape.num_elements() * size_of_dtype; diff --git a/dali/pipeline/data/dltensor.cc b/dali/pipeline/data/dltensor.cc index 0e71459dbd..fff05936ee 100644 --- a/dali/pipeline/data/dltensor.cc +++ b/dali/pipeline/data/dltensor.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -19,18 +19,18 @@ namespace dali { -DLDataType GetDLType(DALIDataType type) { +DLDataType ToDLType(DALIDataType type) { DLDataType dl_type{}; TYPE_SWITCH(type, type2id, T, (DALI_NUMERIC_TYPES_FP16, bool), ( dl_type.bits = sizeof(T) * 8; dl_type.lanes = 1; - if (dali::is_fp_or_half::value) { + if constexpr (dali::is_fp_or_half::value) { dl_type.code = kDLFloat; - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { dl_type.code = kDLBool; - } else if (std::is_unsigned::value) { + } else if constexpr (std::is_unsigned_v) { dl_type.code = kDLUInt; - } else if (std::is_integral::value) { + } else if constexpr (std::is_integral_v) { dl_type.code = kDLInt; } else { DALI_FAIL(make_string("This data type (", type, ") cannot be handled by DLTensor.")); @@ -39,45 +39,13 @@ DLDataType GetDLType(DALIDataType type) { return dl_type; } -void DLManagedTensorDeleter(DLManagedTensor *self) { - delete static_cast(self->manager_ctx); -} - void DLMTensorPtrDeleter(DLManagedTensor* dlm_tensor_ptr) { if (dlm_tensor_ptr && dlm_tensor_ptr->deleter) { dlm_tensor_ptr->deleter(dlm_tensor_ptr); } } -DLMTensorPtr MakeDLTensor(void* data, DALIDataType type, - bool device, int device_id, - std::unique_ptr resource) { - DLManagedTensor *dlm_tensor_ptr = &resource->dlm_tensor; - DLTensor &dl_tensor = dlm_tensor_ptr->dl_tensor; - dl_tensor.data = data; - dl_tensor.ndim = resource->shape.size(); - dl_tensor.shape = resource->shape.begin(); - if (!resource->strides.empty()) { - dl_tensor.strides = resource->strides.data(); - } - if (device) { - dl_tensor.device = {kDLCUDA, device_id}; - } else { - dl_tensor.device = {kDLCPU, 0}; - } - dl_tensor.dtype = GetDLType(type); - dlm_tensor_ptr->deleter = &DLManagedTensorDeleter; - dlm_tensor_ptr->manager_ctx = resource.release(); - return {dlm_tensor_ptr, &DLMTensorPtrDeleter}; -} - -inline std::string to_string(const DLDataType &dl_type) { - return std::string("{code: ") - + (dl_type.code ? ((dl_type.code == 2) ? "kDLFloat" : "kDLUInt") : "kDLInt") - + ", bits: " + std::to_string(dl_type.bits) + ", lanes: " + std::to_string(dl_type.lanes) + "}"; -} - -DALIDataType DLToDALIType(const DLDataType &dl_type) { +DALIDataType ToDALIType(const DLDataType &dl_type) { DALI_ENFORCE(dl_type.lanes == 1, "DALI Tensors do not support types with the number of lanes other than 1"); switch (dl_type.code) { diff --git a/dali/pipeline/data/dltensor.h b/dali/pipeline/data/dltensor.h index d97acedab2..90353b5b3d 100644 --- a/dali/pipeline/data/dltensor.h +++ b/dali/pipeline/data/dltensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
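[Note on the ToDLType change above: the switch from plain `if` to `if constexpr` is more than style. Inside TYPE_SWITCH, every branch of a runtime `if` chain must compile for every instantiated type T, whereas `if constexpr` discards the untaken branches at compile time. A minimal standalone sketch (plain C++17, not DALI code) of the difference:]

```cpp
// With a plain `if`, both branches must compile for every T, so calling a
// float-only function in one branch breaks integral instantiations.
// `if constexpr` discards the untaken branch at compile time.
#include <cmath>
#include <iostream>
#include <type_traits>

template <typename T>
int type_code(T value) {
  if constexpr (std::is_floating_point_v<T>) {
    // Only instantiated for floating-point T; std::isnan(value) does not
    // even need to compile for integral T.
    return std::isnan(value) ? -1 : 2;
  } else if constexpr (std::is_unsigned_v<T>) {
    return 1;
  } else {
    return 0;
  }
}

int main() {
  std::cout << type_code(1.0f) << type_code(42u) << type_code(-7) << "\n";  // prints "210"
}
```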
@@ -15,7 +15,11 @@
 #ifndef DALI_PIPELINE_DATA_DLTENSOR_H_
 #define DALI_PIPELINE_DATA_DLTENSOR_H_
 
+#include <cassert>
 #include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
 #include <utility>
 #include <vector>
 #include "third_party/dlpack/include/dlpack/dlpack.h"
@@ -25,55 +29,296 @@
 
 namespace dali {
 
-DLL_PUBLIC void DLManagedTensorDeleter(DLManagedTensor *self);
-
-DLL_PUBLIC void DLMTensorPtrDeleter(DLManagedTensor *ptr);
+//////////////////////////////////////////////////////////////////////////////
+// DLPack utilities
 
 using DLMTensorPtr = std::unique_ptr<DLManagedTensor, void (*)(DLManagedTensor *)>;
 
-DLL_PUBLIC DLDataType GetDLType(DALIDataType type);
+/** A deleter which calls `DLManagedTensor::deleter` */
+DLL_PUBLIC void DLMTensorPtrDeleter(DLManagedTensor* dlm_tensor_ptr);
+
+/** Converts a DALI type to DLPack type. */
+DLL_PUBLIC DLDataType ToDLType(DALIDataType type);
+
+/** Converts a DLPack type to DALI type. */
+DLL_PUBLIC DALIDataType ToDALIType(const DLDataType &dl_type);
+
+/** Returns the type string for a given DLPack type
+ *
+ * The text representation looks like:
+ * <code><bits>[x<lanes>]
+ * with x<lanes> present only if the number of lanes is > 1
+ *
+ * Examples:
+ * u8    - 8-bit unsigned integer
+ * i32   - 32-bit signed integer
+ * f64   - 64-bit floating point number
+ * bf16  - bfloat16
+ * b8    - 8-bit boolean
+ * c64   - 64-bit complex number
+ * f32x4 - 128-bit vector consisting of 4 32-bit floating point numbers
+ *
+ * If the code is unknown, the type code is replaced by '<unknown:code>' - a type with an unknown
+ * code 42, 2 lanes and 32-bits would look like <unknown:42>32x2
+ */
+inline std::string to_string(const DLDataType &dl_type) {
+  const char *code_str[] = {
+    "i", "u", "f", "p", "bf", "c", "b"
+  };
+  std::stringstream ss;
+  if (dl_type.code < std::size(code_str))
+    ss << code_str[dl_type.code];
+  else
+    ss << "<unknown:" << dl_type.code + 0 << ">";
+  ss << dl_type.bits + 0;
+  if (dl_type.lanes > 1)
+    ss << 'x' << dl_type.lanes + 0;
+  return ss.str();
+}
+
+inline std::ostream &operator<<(std::ostream &os, const DLDataType &dl_type) {
+  return os << to_string(dl_type);
+}
+
+constexpr DLDevice ToDLDevice(bool is_device, bool is_pinned, int device_id) {
+  if (is_device)
+    return {kDLCUDA, device_id};
+  else
+    return {is_pinned ? kDLCUDAHost : kDLCPU, 0};
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// DLTensorResource
+
+/** Default non-owning payload for DLPack tensors. */
+struct TensorViewPayload {
+  TensorShape<> shape, strides;
+};
+
+/** Default ownership-sharing payload for DLPack tensors. */
+struct SharedTensorPayload : TensorViewPayload {
+  std::shared_ptr<void> data;
+  SharedTensorPayload() = default;
+  SharedTensorPayload(TensorShape<> shape, TensorShape<> strides, std::shared_ptr<void> data)
+  : TensorViewPayload{ std::move(shape), std::move(strides) }
+  , data(std::move(data)) {}
+};
+
+
+/** A wrapper for DLManagedTensor along with its `context_manager`.
+ *
+ * This is a non-intuitive circular-reference structure.
+ * DLManagedTensor lives inside the "resource", but the context_manager points to the resource.
+ *
+ * The diagram below depicts a typical relationship between DLTensorResource and its members:
+ *
+ * ```
+ * DLTensorResource <-----------------+
+ *  |                                 |
+ *  +-- DLManagedTensor               |
+ *  |    |                            |
+ *  |    +-- DLTensor                 |
+ *  |    |    |                       |
+ *  |    |    +-- *shape -------+     |
+ *  |    |    +-- *strides -----)--+  |
+ *  |    |    +-- ...           |  |  |
+ *  |    |                      |  |  |
+ *  |    +- *context_manager ---)--)--+
+ *  |    +- *deleter            |  |
+ *  |                           |  |
+ *  +-- Payload                 |  |
+ *       |                      |  |
+ *       +-- shape <------------+  |
+ *       +-- strides <-------------+
+ *       +-- ...
+ * ```
+ *
+ * You can use any payload structure of your choice, but it must provide the storage for DLTensor's
+ * `shape` (and `strides`, if necessary).
+ */
+template <typename Payload>
 struct DLTensorResource {
-  explicit DLTensorResource(TensorShape<> shape)
-  : shape(std::move(shape))
-  , strides() {}
+  template <typename... PayloadArgs>
+  explicit DLTensorResource(PayloadArgs &&...args)
+  : dlm_tensor{{}, this, dlm_deleter}
+  , payload{std::forward<PayloadArgs>(args)...} {}
+
 
-  TensorShape<> shape;
-  TensorShape<> strides;
   DLManagedTensor dlm_tensor{};
+  Payload payload;
 
-  virtual ~DLTensorResource() = default;
+  template <typename... PayloadArgs>
+  static std::unique_ptr<DLTensorResource> Create(PayloadArgs &&...args) {
+    return std::make_unique<DLTensorResource>(std::forward<PayloadArgs>(args)...);
+  }
+
+  static void dlm_deleter(DLManagedTensor *tensor) {
+    if (tensor == nullptr)
+      return;
+    auto *This = static_cast<DLTensorResource *>(tensor->manager_ctx);
+    assert(&This->dlm_tensor == tensor);  // is that always the case?
+    delete This;
+  }
 };
 
-DLL_PUBLIC DLMTensorPtr MakeDLTensor(void *data, DALIDataType type,
-                                     bool device, int device_id,
-                                     std::unique_ptr<DLTensorResource> resource);
+/** Type-erases the DLTensorResource and returns a smart pointer to the contained DLManagedTensor.
+ */
+template <typename Payload>
+DLMTensorPtr ToDLMTensor(std::unique_ptr<DLTensorResource<Payload>> rsrc) {
+  return { &rsrc.release()->dlm_tensor, DLMTensorPtrDeleter };
+}
+
+namespace detail {
+/** Populates the DLTensor stored in `rsrc`. Shapes and strides will point to `rsrc.payload`. */
+template <typename Payload>
+void InitResourceDLTensor(DLTensorResource<Payload> &rsrc,
+                          void *data, DALIDataType type,
+                          bool device, bool pinned, int device_id) {
+  auto &tensor = rsrc.dlm_tensor.dl_tensor;
+  tensor = {};
+  tensor.data = data;
+  tensor.shape = rsrc.payload.shape.data();
+  tensor.ndim = rsrc.payload.shape.size();
+  tensor.strides = rsrc.payload.strides.empty() ? nullptr : rsrc.payload.strides.data();
+  tensor.device = ToDLDevice(device, pinned, device_id);
+  tensor.dtype = ToDLType(type);
+}
+}  // namespace detail
+
+/** Constructs a DLTensorResource WITHOUT data ownership. */
+inline auto MakeDLTensorResource(void *data, DALIDataType type,
+                                 bool device, bool pinned, int device_id,
+                                 const TensorShape<> &shape,
+                                 const TensorShape<> &strides = {}) {
+  if (!strides.empty() && strides.size() != shape.size())
+    throw std::invalid_argument("If `strides` are not empty they must have the same number "
+                                "of elements as `shape`.");
+  auto rsrc = DLTensorResource<TensorViewPayload>::Create(shape, strides);
+  detail::InitResourceDLTensor(*rsrc, data, type, device, pinned, device_id);
+  return rsrc;
+}
+
+/** Constructs a DLManagedTensor WITHOUT data ownership. */
+inline DLMTensorPtr MakeDLTensor(void *data, DALIDataType type,
+                                 bool device, bool pinned, int device_id,
+                                 const TensorShape<> &shape,
+                                 const TensorShape<> &strides = {}) {
+  return ToDLMTensor(MakeDLTensorResource(data, type, device, pinned, device_id, shape, strides));
+}
+
+/** Constructs a DLTensorResource sharing the data ownership. */
+inline auto MakeDLTensorResource(std::shared_ptr<void> data, DALIDataType type,
+                                 bool device, bool pinned, int device_id,
+                                 const TensorShape<> &shape,
+                                 const TensorShape<> &strides = {}) {
+  if (!strides.empty() && strides.size() != shape.size())
+    throw std::invalid_argument("If `strides` are not empty they must have the same number "
+                                "of elements as `shape`.");
+  auto rsrc = DLTensorResource<SharedTensorPayload>::Create(shape, strides, std::move(data));
+  detail::InitResourceDLTensor(*rsrc, rsrc->payload.data.get(), type, device, pinned, device_id);
+  return rsrc;
+}
+
+/** Constructs a DLManagedTensor sharing the data ownership.
+ */
+inline DLMTensorPtr MakeDLTensor(std::shared_ptr<void> data, DALIDataType type,
+                                 bool device, bool pinned, int device_id,
+                                 const TensorShape<> &shape,
+                                 const TensorShape<> &strides = {}) {
+  return ToDLMTensor(MakeDLTensorResource(
+      std::move(data), type, device, pinned, device_id, shape, strides));
+}
+
+/** Gets a DLManagedTensor which does not hold a reference on the data.
+ *
+ * This function constructs a DLTensor whose context manager stores only the shape data.
+ * The returned DLPack tensor must not outlive the original Tensor.
+ */
 template <typename Backend>
-DLMTensorPtr GetDLTensorView(SampleView<Backend> tensor, int device_id) {
-  return MakeDLTensor(tensor.raw_mutable_data(),
-                      tensor.type(),
-                      std::is_same<Backend, GPUBackend>::value,
-                      device_id,
-                      std::make_unique<DLTensorResource>(tensor.shape()));
+DLMTensorPtr GetDLTensorView(const SampleView<Backend> &tensor, bool pinned, int device_id) {
+  auto rsrc = MakeDLTensorResource(
+      tensor.raw_mutable_data(), tensor.type(),
+      std::is_same_v<Backend, GPUBackend>, pinned, device_id,
+      tensor.shape());
+  return ToDLMTensor(std::move(rsrc));
 }
 
+
+/** Gets a list of DLManagedTensors which do not hold a reference on the data.
+ *
+ * This function constructs a list of DLTensors whose context managers store only the shape data.
+ * The returned DLPack tensors must not outlive the original TensorList.
+ */
 template <typename Backend>
 std::vector<DLMTensorPtr> GetDLTensorListView(TensorList<Backend> &tensor_list) {
+  int device_id = tensor_list.device_id();
+  bool pinned = tensor_list.is_pinned();
+
   std::vector<DLMTensorPtr> dl_tensors{};
   dl_tensors.reserve(tensor_list.num_samples());
-  for (int i = 0; i < tensor_list.num_samples(); ++i) {
-    const auto &shape = tensor_list.tensor_shape(i);
-    dl_tensors.push_back(MakeDLTensor(tensor_list.raw_mutable_tensor(i),
-                                      tensor_list.type(),
-                                      std::is_same<Backend, GPUBackend>::value,
-                                      tensor_list.device_id(),
-                                      std::make_unique<DLTensorResource>(shape)));
-  }
+
+  for (int i = 0; i < tensor_list.num_samples(); ++i)
+    dl_tensors.push_back(GetDLTensorView(tensor_list[i], pinned, device_id));
+  return dl_tensors;
+}
+
+
+/** Gets a DLManagedTensor which shares the buffer ownership with a tensor.
+ *
+ * This function constructs a DLTensor whose context manager stores a shared pointer to the
+ * tensor contents.
+ * It can be used to remove data ownership from DALI to an external library.
+ */
+template <typename Backend>
+DLMTensorPtr GetSharedDLTensor(Tensor<Backend> &tensor) {
+  auto rsrc = MakeDLTensorResource(
+      tensor.get_data_ptr(), tensor.type(),
+      std::is_same_v<Backend, GPUBackend>, tensor.is_pinned(), tensor.device_id(),
+      tensor.shape());
+  return ToDLMTensor(std::move(rsrc));
+}
+
+/** Gets a DLManagedTensor which shares the buffer ownership with a tensor.
+ *
+ * This function constructs a DLTensor whose context manager stores a shared pointer to the
+ * tensor contents.
+ * It can be used to remove data ownership from DALI to an external library.
+ */
+template <typename Backend>
+DLMTensorPtr GetSharedDLTensor(const SampleView<Backend> &tensor,
+                               std::shared_ptr<void> data, bool pinned, int device_id) {
+  assert(tensor.raw_mutable_data() == data.get());
+  auto rsrc = MakeDLTensorResource(
+      std::move(data), tensor.type(),
+      std::is_same_v<Backend, GPUBackend>, pinned, device_id,
+      tensor.shape());
+  return ToDLMTensor(std::move(rsrc));
+}
+
+
+/** Gets a vector of DLManagedTensors which share the buffer ownership with a TensorList.
+ *
+ * This function constructs a list of DLTensors whose context managers store shared pointers to
+ * the samples in the TensorList.
+ * It can be used to remove data ownership from DALI to an external library.
+ */
+template <typename Backend>
+std::vector<DLMTensorPtr> GetSharedDLTensorList(TensorList<Backend> &tensor_list) {
+  int device_id = tensor_list.device_id();
+  bool pinned = tensor_list.is_pinned();
+
+  std::vector<DLMTensorPtr> dl_tensors{};
+  dl_tensors.reserve(tensor_list.num_samples());
+
+  for (int i = 0; i < tensor_list.num_samples(); ++i)
+    dl_tensors.push_back(GetSharedDLTensor(
+        tensor_list[i],
+        unsafe_sample_owner(tensor_list, i),
+        pinned,
+        device_id));
   return dl_tensors;
 }
 
-DLL_PUBLIC DALIDataType DLToDALIType(const DLDataType &dl_type);
 
 }  // namespace dali
 
 #endif  // DALI_PIPELINE_DATA_DLTENSOR_H_
diff --git a/dali/pipeline/data/dltensor_test.cc b/dali/pipeline/data/dltensor_test.cc
index ede7b18504..d5b13fa7e3 100644
--- a/dali/pipeline/data/dltensor_test.cc
+++ b/dali/pipeline/data/dltensor_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,118 +21,217 @@ namespace dali {
 
-TEST(DLMTensorPtr, CPU) {
+TEST(DLPackTest, DLType) {
+  DLDataType dl;
+  for (DALIDataType dali : {
+        DALI_BOOL,
+        DALI_FLOAT16, DALI_FLOAT, DALI_FLOAT64,
+        DALI_INT8, DALI_UINT8,
+        DALI_INT16, DALI_UINT16,
+        DALI_INT32, DALI_UINT32,
+        DALI_INT64, DALI_UINT64 }) {
+    dl = ToDLType(dali);
+    TypeInfo info = TypeTable::GetTypeInfo(dali);
+    EXPECT_EQ(dl.lanes, 1);
+    EXPECT_EQ(dl.bits, info.size() * 8);
+    if (info.name().find("uint") == 0) {
+      EXPECT_EQ(dl.code, kDLUInt);
+    } else if (info.name().find("int") == 0) {
+      EXPECT_EQ(dl.code, kDLInt);
+    } else if (info.name().find("float") == 0) {
+      EXPECT_EQ(dl.code, kDLFloat);
+    } else if (info.name().find("bool") == 0) {
+      EXPECT_EQ(dl.code, kDLBool);
+    }
+
+    EXPECT_EQ(ToDALIType(dl), dali) << "Conversion back to DALI type yielded a different type.";
+  }
+}
+
+TEST(DLPackTest, DLTypeToString) {
+  EXPECT_EQ(to_string(DLDataType{ kDLBool, 8, 1 }), "b8");
+  EXPECT_EQ(to_string(DLDataType{ kDLBfloat, 16, 1 }), "bf16");
+  EXPECT_EQ(to_string(DLDataType{ kDLFloat, 32, 4 }), "f32x4");
+  EXPECT_EQ(to_string(DLDataType{ kDLUInt, 16, 2 }), "u16x2");
+  EXPECT_EQ(to_string(DLDataType{ kDLInt, 64, 1 }), "i64");
+  EXPECT_EQ(to_string(DLDataType{ 123, 8, 16 }), "<unknown:123>8x16");
+}
+
+namespace {
+
+void TestSampleViewCPU(bool pinned) {
   Tensor<CPUBackend> tensor;
+  tensor.set_pinned(pinned);
+  tensor.set_device_id(0);
   tensor.Resize({100, 50, 3}, DALI_FLOAT);
   SampleView<CPUBackend> sv{tensor.raw_mutable_data(), tensor.shape(), tensor.type()};
-  DLMTensorPtr dlm_tensor = GetDLTensorView(sv, tensor.device_id());
-  ASSERT_EQ(dlm_tensor->dl_tensor.ndim, 3);
-  ASSERT_EQ(dlm_tensor->dl_tensor.shape[0], 100);
-  ASSERT_EQ(dlm_tensor->dl_tensor.shape[1], 50);
-  ASSERT_EQ(dlm_tensor->dl_tensor.shape[2], 3);
-  ASSERT_EQ(dlm_tensor->dl_tensor.data, sv.raw_data());
-  ASSERT_EQ(dlm_tensor->dl_tensor.dtype.code, kDLFloat);
-  ASSERT_EQ(dlm_tensor->dl_tensor.dtype.bits, sizeof(float) * 8);
-  ASSERT_EQ(dlm_tensor->dl_tensor.device.device_type, kDLCPU);
-  ASSERT_EQ(dlm_tensor->dl_tensor.byte_offset, 0);
+  DLMTensorPtr dlm_tensor = GetDLTensorView(sv, tensor.is_pinned(), tensor.device_id());
+  EXPECT_EQ(dlm_tensor->dl_tensor.ndim, 3);
+  EXPECT_EQ(dlm_tensor->dl_tensor.shape[0], 100);
+  EXPECT_EQ(dlm_tensor->dl_tensor.shape[1], 50);
+  EXPECT_EQ(dlm_tensor->dl_tensor.shape[2], 3);
+  EXPECT_EQ(dlm_tensor->dl_tensor.data, sv.raw_data());
+  EXPECT_EQ(dlm_tensor->dl_tensor.dtype.code, kDLFloat);
+
EXPECT_EQ(dlm_tensor->dl_tensor.dtype.bits, sizeof(float) * 8); + EXPECT_EQ(dlm_tensor->dl_tensor.device.device_type, pinned ? kDLCUDAHost : kDLCPU); + EXPECT_EQ(dlm_tensor->dl_tensor.byte_offset, 0); +} + +} // namespace + +TEST(DLMTensorPtr, ViewCPU) { + TestSampleViewCPU(false); +} + +TEST(DLMTensorPtr, ViewPinnedCPU) { + TestSampleViewCPU(true); +} + +TEST(DLMTensorPtr, CPUShared) { + Tensor tensor; + tensor.set_pinned(false); + tensor.set_device_id(0); + tensor.Resize({100, 50, 3}, DALI_FLOAT); + { + DLMTensorPtr dlm_tensor = GetSharedDLTensor(tensor); + EXPECT_EQ(tensor.get_data_ptr().use_count(), 2) << "Reference count not increased"; + EXPECT_EQ(dlm_tensor->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[0], 100); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[1], 50); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[2], 3); + EXPECT_EQ(dlm_tensor->dl_tensor.data, tensor.raw_data()); + EXPECT_EQ(dlm_tensor->dl_tensor.dtype.code, kDLFloat); + EXPECT_EQ(dlm_tensor->dl_tensor.dtype.bits, sizeof(float) * 8); + EXPECT_EQ(dlm_tensor->dl_tensor.device.device_type, kDLCPU); + EXPECT_EQ(dlm_tensor->dl_tensor.byte_offset, 0); + } + EXPECT_EQ(tensor.get_data_ptr().use_count(), 1) << "Reference leaked."; } -TEST(DLMTensorPtr, GPU) { +TEST(DLMTensorPtr, ViewGPU) { Tensor tensor; tensor.Resize({100, 50, 1}, DALI_INT32); SampleView sv{tensor.raw_mutable_data(), tensor.shape(), tensor.type()}; - DLMTensorPtr dlm_tensor = GetDLTensorView(sv, tensor.device_id()); - ASSERT_EQ(dlm_tensor->dl_tensor.ndim, 3); - ASSERT_EQ(dlm_tensor->dl_tensor.shape[0], 100); - ASSERT_EQ(dlm_tensor->dl_tensor.shape[1], 50); - ASSERT_EQ(dlm_tensor->dl_tensor.shape[2], 1); - ASSERT_EQ(dlm_tensor->dl_tensor.data, sv.raw_data()); - ASSERT_EQ(dlm_tensor->dl_tensor.dtype.code, kDLInt); - ASSERT_EQ(dlm_tensor->dl_tensor.dtype.bits, sizeof(int) * 8); - ASSERT_EQ(dlm_tensor->dl_tensor.device.device_type, kDLCUDA); - ASSERT_EQ(dlm_tensor->dl_tensor.device.device_id, tensor.device_id()); - ASSERT_EQ(dlm_tensor->dl_tensor.byte_offset, 0); + DLMTensorPtr dlm_tensor = GetDLTensorView(sv, false, tensor.device_id()); + EXPECT_EQ(dlm_tensor->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[0], 100); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[1], 50); + EXPECT_EQ(dlm_tensor->dl_tensor.shape[2], 1); + EXPECT_EQ(dlm_tensor->dl_tensor.data, sv.raw_data()); + EXPECT_EQ(dlm_tensor->dl_tensor.dtype.code, kDLInt); + EXPECT_EQ(dlm_tensor->dl_tensor.dtype.bits, sizeof(int) * 8); + EXPECT_EQ(dlm_tensor->dl_tensor.device.device_type, kDLCUDA); + EXPECT_EQ(dlm_tensor->dl_tensor.device.device_id, tensor.device_id()); + EXPECT_EQ(dlm_tensor->dl_tensor.byte_offset, 0); } TEST(DLMTensorPtr, CPUList) { TensorList tlist; + tlist.set_pinned(false); tlist.Resize({{100, 50, 1}, {50, 30, 3}}, DALI_FLOAT64); std::vector dlm_tensors = GetDLTensorListView(tlist); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.ndim, 3); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[0], 100); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[1], 50); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[2], 1); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.data, tlist.raw_tensor(0)); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.dtype.code, kDLFloat); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.dtype.bits, sizeof(double) * 8); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.device.device_type, kDLCPU); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.byte_offset, 0); - - ASSERT_EQ(tlist.tensor_shape(1).size(), 3); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.ndim, 3); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[0], 50); - 
ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[1], 30); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[2], 3); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.data, tlist.raw_tensor(1)); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.dtype.code, kDLFloat); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.dtype.bits, sizeof(double) * 8); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.device.device_type, kDLCPU); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.byte_offset, 0); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[0], 100); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[1], 50); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[2], 1); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.data, tlist.raw_tensor(0)); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.code, kDLFloat); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.bits, sizeof(double) * 8); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.device.device_type, kDLCPU); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.byte_offset, 0); + + EXPECT_EQ(tlist.tensor_shape(1).size(), 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[0], 50); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[1], 30); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[2], 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.data, tlist.raw_tensor(1)); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.code, kDLFloat); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.bits, sizeof(double) * 8); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.device.device_type, kDLCPU); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.byte_offset, 0); +} + + +TEST(DLMTensorPtr, CPUSharedList) { + TensorList tlist; + tlist.set_pinned(false); + tlist.Resize({{100, 50, 1}, {50, 30, 3}}, DALI_FLOAT64); + const auto &ptr = unsafe_owner(tlist); + EXPECT_EQ(ptr.use_count(), 3); + std::vector dlm_tensors = GetSharedDLTensorList(tlist); + EXPECT_EQ(ptr.use_count(), 5); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[0], 100); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[1], 50); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[2], 1); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.data, tlist.raw_tensor(0)); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.code, kDLFloat); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.bits, sizeof(double) * 8); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.device.device_type, kDLCPU); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.byte_offset, 0); + + EXPECT_EQ(tlist.tensor_shape(1).size(), 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[0], 50); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[1], 30); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[2], 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.data, tlist.raw_tensor(1)); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.code, kDLFloat); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.bits, sizeof(double) * 8); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.device.device_type, kDLCPU); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.byte_offset, 0); + dlm_tensors.clear(); + EXPECT_EQ(ptr.use_count(), 3); } TEST(DLMTensorPtr, GPUList) { TensorList tlist; tlist.Resize({{100, 50, 1}, {50, 30, 3}}, DALI_UINT8); std::vector dlm_tensors = GetDLTensorListView(tlist); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.ndim, 3); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[0], 100); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[1], 50); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.shape[2], 1); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.data, tlist.raw_tensor(0)); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.dtype.code, kDLUInt); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.dtype.bits, sizeof(uint8_t) 
* 8); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.device.device_type, kDLCUDA); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.byte_offset, 0); - ASSERT_EQ(dlm_tensors[0]->dl_tensor.device.device_id, tlist.device_id()); - - ASSERT_EQ(dlm_tensors[1]->dl_tensor.ndim, 3); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[0], 50); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[1], 30); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.shape[2], 3); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.data, tlist.raw_tensor(1)); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.dtype.code, kDLUInt); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.dtype.bits, sizeof(uint8_t) * 8); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.device.device_type, kDLCUDA); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.byte_offset, 0); - ASSERT_EQ(dlm_tensors[1]->dl_tensor.device.device_id, tlist.device_id()); -} + EXPECT_EQ(dlm_tensors[0]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[0], 100); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[1], 50); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.shape[2], 1); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.data, tlist.raw_tensor(0)); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.code, kDLUInt); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.dtype.bits, sizeof(uint8_t) * 8); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.device.device_type, kDLCUDA); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.byte_offset, 0); + EXPECT_EQ(dlm_tensors[0]->dl_tensor.device.device_id, tlist.device_id()); -struct TestDLTensorResource: public DLTensorResource { - TestDLTensorResource(TensorShape<> shape, bool &called) - : DLTensorResource(std::move(shape)) - , called(called) { - called = false; - } + EXPECT_EQ(dlm_tensors[1]->dl_tensor.ndim, 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[0], 50); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[1], 30); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.shape[2], 3); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.data, tlist.raw_tensor(1)); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.code, kDLUInt); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.dtype.bits, sizeof(uint8_t) * 8); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.device.device_type, kDLCUDA); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.byte_offset, 0); + EXPECT_EQ(dlm_tensors[1]->dl_tensor.device.device_id, tlist.device_id()); +} - bool &called; +struct TestDLPayload { + explicit TestDLPayload(bool &destroyed) + : destroyed(destroyed) {} - ~TestDLTensorResource() override { - called = true; + ~TestDLPayload() { + destroyed = true; } + + bool &destroyed; }; + TEST(DLMTensorPtr, Cleanup) { - Tensor tensor; - tensor.Resize({100, 50, 3}, DALI_FLOAT); bool deleter_called = false; { - auto dlm_tensor = MakeDLTensor(tensor.raw_mutable_data(), - tensor.type(), - false, -1, - std::make_unique(tensor.shape(), - deleter_called)); + auto rsrc = DLTensorResource::Create(deleter_called); + auto dlm_tensor = ToDLMTensor(std::move(rsrc)); + EXPECT_EQ(rsrc, nullptr); } - ASSERT_TRUE(deleter_called); + EXPECT_TRUE(deleter_called); } } // namespace dali diff --git a/dali/pipeline/util/copy_with_stride_test.cc b/dali/pipeline/util/copy_with_stride_test.cc index 27e6ccd558..95ca5fa993 100644 --- a/dali/pipeline/util/copy_with_stride_test.cc +++ b/dali/pipeline/util/copy_with_stride_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
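[For reference, a usage sketch of the reworked factory functions above (a best-effort illustration against the signatures introduced in this patch, not part of it). Strides are given in elements, as in the CopyWithStride tests that follow:]

```cpp
#include <vector>
#include "dali/pipeline/data/dltensor.h"

// Wrap an existing host buffer as a non-owning DLPack tensor. The caller must
// keep `data` alive for as long as the DLManagedTensor is in use; only the
// shape/strides are stored in the resource's payload.
dali::DLMTensorPtr WrapHostBuffer(std::vector<float> &data) {
  // device=false, pinned=false, device_id=-1 => plain pageable host memory.
  return dali::MakeDLTensor(data.data(), dali::DALI_FLOAT,
                            /*device=*/false, /*pinned=*/false, /*device_id=*/-1,
                            dali::TensorShape<>{100, 50},
                            dali::TensorShape<>{50, 1});  // strides in elements
}
```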
@@ -30,10 +30,7 @@ TEST(CopyWithStrideTest, OneDim) { constexpr int vol = 3; ASSERT_EQ(vol, volume(shape)); std::array out; - DLTensorResource resource(shape); - resource.strides = stride; - auto dl_tensor = - MakeDLTensor(data, dtype, false, -1, std::make_unique(resource)); + auto dl_tensor = MakeDLTensor(data, dtype, false, false, -1, shape, stride); CopyDlTensorCpu(out.data(), dl_tensor); ASSERT_TRUE((out == std::array{1, 3, 5})); } @@ -50,10 +47,7 @@ TEST(CopyWithStrideTest, TwoDims) { constexpr int vol = 8; ASSERT_EQ(vol, volume(shape)); std::array out; - DLTensorResource resource(shape); - resource.strides = stride; - auto dl_tensor = - MakeDLTensor(data, dtype, false, -1, std::make_unique(resource)); + auto dl_tensor = MakeDLTensor(data, dtype, false, false, -1, shape, stride); CopyDlTensorCpu(out.data(), dl_tensor); ASSERT_TRUE((out == std::array{11, 12, 13, 14, 31, 32, 33, 34})); @@ -72,10 +66,7 @@ TEST(CopyWithStrideTest, SimpleCopy) { constexpr int vol = 8; ASSERT_EQ(vol, volume(shape)); std::array out; - DLTensorResource resource(shape); - resource.strides = stride; - auto dl_tensor = - MakeDLTensor(data, dtype, false, -1, std::make_unique(resource)); + auto dl_tensor = MakeDLTensor(data, dtype, false, false, -1, shape, stride); CopyDlTensorCpu(out.data(), dl_tensor); ASSERT_TRUE((out == std::array{1, 2, 3, 4, @@ -85,9 +76,7 @@ TEST(CopyWithStrideTest, SimpleCopy) { } DLMTensorPtr AsDlTensor(void* data, DALIDataType dtype, TensorShape<> shape, TensorShape<> stride) { - DLTensorResource resource(shape); - resource.strides = stride; - return MakeDLTensor(data, dtype, true, 0, std::make_unique(resource)); + return MakeDLTensor(data, dtype, false, false, -1, shape, stride); } std::vector DlTensorSingletonBatch(DLMTensorPtr dl_tensor) { diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc index b9f43e8004..129260cebf 100644 --- a/dali/python/backend_impl.cc +++ b/dali/python/backend_impl.cc @@ -200,7 +200,7 @@ void FillTensorFromDlPack(py::capsule capsule, SourceDataType *batch dl_tensor.device.device_type == kDLCPU), "DLPack device type doesn't match Tensor type"); - const TypeInfo &dali_type = TypeTable::GetTypeInfo(DLToDALIType(dl_tensor.dtype)); + const TypeInfo &dali_type = TypeTable::GetTypeInfo(ToDALIType(dl_tensor.dtype)); TensorShape<> shape; shape.resize(dl_tensor.ndim); for (ssize_t i = 0; i < dl_tensor.ndim; ++i) { @@ -497,7 +497,7 @@ void ExposeTensor(py::module &m) { [](Tensor &t) -> py::capsule { SampleView sv{t.raw_mutable_data(), t.shape(), t.type()}; - return TensorToDLPackView(sv, t.device_id()); + return TensorToDLPackView(sv, t.is_pinned(), t.device_id()); }, R"code( Exposes tensor data as DLPack compatible capsule. @@ -692,7 +692,7 @@ void ExposeTensor(py::module &m) { [](Tensor &t) -> py::capsule { SampleView sv{t.raw_mutable_data(), t.shape(), t.type()}; - return TensorToDLPackView(sv, t.device_id()); + return TensorToDLPackView(sv, t.is_pinned(), t.device_id()); }, R"code( Exposes tensor data as DLPack compatible capsule. diff --git a/dali/util/pybind.h b/dali/util/pybind.h index c4ec541edd..9073688517 100644 --- a/dali/util/pybind.h +++ b/dali/util/pybind.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
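[Passing `is_pinned()` through to the capsule above matters downstream: pinned host buffers are now exported as `kDLCUDAHost` rather than `kDLCPU`. A hedged sketch of how a DLPack consumer might use that distinction (hypothetical helper, written against plain dlpack.h):]

```cpp
#include "third_party/dlpack/include/dlpack/dlpack.h"

// kDLCUDAHost marks page-locked host memory, so a consumer may safely use an
// asynchronous H2D copy from it; kDLCPU gives no such guarantee.
inline bool IsAsyncCopySafe(const DLTensor &t) {
  // Device memory or pinned host memory can take part in async CUDA copies.
  return t.device.device_type == kDLCUDA || t.device.device_type == kDLCUDAHost;
}
```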
@@ -217,8 +217,8 @@ static py::capsule DLTensorToCapsule(DLMTensorPtr dl_tensor) { } template -py::capsule TensorToDLPackView(SampleView tensor, int device_id) { - DLMTensorPtr dl_tensor = GetDLTensorView(tensor, device_id); +py::capsule TensorToDLPackView(SampleView tensor, bool pinned, int device_id) { + DLMTensorPtr dl_tensor = GetDLTensorView(tensor, pinned, device_id); return DLTensorToCapsule(std::move(dl_tensor)); } From f09a8f3a2e6eec6bc3afadd5fbb0e9f0f48a15bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Thu, 10 Oct 2024 07:49:12 +0200 Subject: [PATCH 13/29] Bump numpy version in Xavier tests. (#5663) Signed-off-by: Michal Zientkiewicz --- qa/TL0_python-self-test_xavier/test_nofw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/TL0_python-self-test_xavier/test_nofw.sh b/qa/TL0_python-self-test_xavier/test_nofw.sh index 80f01cfe63..1297f5e26d 100755 --- a/qa/TL0_python-self-test_xavier/test_nofw.sh +++ b/qa/TL0_python-self-test_xavier/test_nofw.sh @@ -4,7 +4,7 @@ if [ -z "$gather_pip_packages" ] then # due to https://github.com/numpy/numpy/issues/18131 we cannot use 1.19.5 - pip_packages='${python_test_runner_package} dataclasses numpy>=1.20 opencv-python pillow psutil astropy' + pip_packages='${python_test_runner_package} dataclasses numpy>=1.23 opencv-python pillow psutil astropy' fi target_dir=./dali/test/python From 94b563aacf4c1daef003505585b8cb6b3f01b42a Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Thu, 10 Oct 2024 09:07:55 +0200 Subject: [PATCH 14/29] Add support for bool type for the numba operator (#5666) - adds support for bool types inside the numba operator - adds a test for that for CPU and GPU Signed-off-by: Janusz Lisiecki --- .../plugin/numba/experimental/__init__.py | 2 + .../test/python/operator_1/test_numba_func.py | 50 ++++++++++++++----- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py b/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py index 742cdc057e..6e7a4b7733 100644 --- a/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py +++ b/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py @@ -25,6 +25,7 @@ _to_numpy = { + dali_types.BOOL: "bool_", dali_types.UINT8: "uint8", dali_types.UINT16: "uint16", dali_types.UINT32: "uint32", @@ -39,6 +40,7 @@ } _to_numba = { + dali_types.BOOL: numba_types.boolean, dali_types.UINT8: numba_types.uint8, dali_types.UINT16: numba_types.uint16, dali_types.UINT32: numba_types.uint32, diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py index ea966b3da1..5d127a68ed 100644 --- a/dali/test/python/operator_1/test_numba_func.py +++ b/dali/test/python/operator_1/test_numba_func.py @@ -34,6 +34,10 @@ lmdb_folder = os.path.join(test_data_root, "db", "lmdb") +def set_all_values_to_1_batch(out0, in0): + out0[0][:] = 1 + + def set_all_values_to_255_batch(out0, in0): out0[0][:] = 255 @@ -42,14 +46,18 @@ def set_all_values_to_255_sample(out0, in0): out0[:] = 255 +def set_all_values_to_1_sample_gpu(out0, in0): + tx, ty, tz = cuda.grid(3) + x_s, y_s, z_s = cuda.gridsize(3) + + out0[tz::z_s, ty::y_s, tx::x_s] = 1 + + def set_all_values_to_255_sample_gpu(out0, in0): tx, ty, tz = cuda.grid(3) x_s, y_s, z_s = cuda.gridsize(3) - for z in range(tz, out0.shape[0], z_s): - for y in range(ty, out0.shape[1], y_s): - for x in range(tx, out0.shape[2], x_s): - out0[z][y][x] = 255 + 
out0[tz::z_s, ty::y_s, tx::x_s] = 255 def set_all_values_to_float_batch(out0, in0): @@ -64,10 +72,7 @@ def set_all_values_to_float_sample_gpu(out0, in0): tx, ty, tz = cuda.grid(3) x_s, y_s, z_s = cuda.gridsize(3) - for z in range(tz, out0.shape[0], z_s): - for y in range(ty, out0.shape[1], y_s): - for x in range(tx, out0.shape[2], x_s): - out0[z][y][x] = 0.5 + out0[tz::z_s, ty::y_s, tx::x_s] = 0.5 def setup_change_out_shape(out_shape, in_shape): @@ -92,10 +97,7 @@ def change_out_shape_sample_gpu(out0, in0): tx, ty, tz = cuda.grid(3) x_s, y_s, z_s = cuda.gridsize(3) - for z in range(tz, out0.shape[0], z_s): - for y in range(ty, out0.shape[1], y_s): - for x in range(tx, out0.shape[2], x_s): - out0[z][y][x] = 42 + out0[tz::z_s, ty::y_s, tx::x_s] = 42 # in shape [x] -> out shape [2, 2, 2, x] @@ -208,6 +210,18 @@ def test_numba_func(): # in_types, out_ndim, in_ndim, setup_fn, batch_processing, # expected_out args = [ + ( + [(10, 10, 10)], + np.bool_, + set_all_values_to_1_batch, + [dali_types.BOOL], + [dali_types.BOOL], + [3], + [3], + None, + True, + [np.full((10, 10, 10), 1, dtype=np.bool_)], + ), ( [(10, 10, 10)], np.uint8, @@ -359,6 +373,18 @@ def test_numba_func_gpu(): # in_types, out_ndim, in_ndim, setup_fn, batch_processing, # expected_out args = [ + ( + [(10, 10, 10)], + np.bool_, + set_all_values_to_1_sample_gpu, + [dali_types.BOOL], + [dali_types.BOOL], + [3], + [3], + None, + None, + [np.full((10, 10, 10), 1, dtype=np.bool_)], + ), ( [(10, 10, 10)], np.uint8, From 15afbdd765af0978b62fc9f2372bf84b26594cff Mon Sep 17 00:00:00 2001 From: Kamil Tokarski Date: Mon, 14 Oct 2024 03:01:24 -0700 Subject: [PATCH 15/29] Update VERSION to 1.44.0dev Signed-off-by: Kamil Tokarski --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 4df74a9747..74ad758f16 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.43.0dev +1.44.0dev From 6695f6c4e0fd1343e8c2e57c24871286afdc9b76 Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:35:51 +0200 Subject: [PATCH 16/29] Add an ability to retry rewind to the one before the last keyframe (#5669) - sometimes the rewind to one before the last keyframe leads to rewinding to this keyframe. With this change when it happens DALI tries to iteratively rewind to one before the last, two before the last, and so on... 
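[Schematically, the retry strategy described above looks like the sketch below (hypothetical `seek` interface shown only for illustration; the actual implementation is the `previous_last_key_frame` logic in the diff that follows):]

```cpp
#include <functional>

// Hypothetical seek interface: seek(target) returns the keyframe index the
// demuxer actually landed on.
inline int RewindBeforeKeyframe(int last_key_frame, const std::function<int(int)> &seek) {
  int target = last_key_frame - 1;
  int landed = seek(target);
  // If seeking still lands on (or past) the keyframe we wanted to get in
  // front of, back off by one more frame per attempt - "one before the last,
  // two before the last, and so on".
  while (landed >= last_key_frame && target > 0) {
    landed = seek(--target);
  }
  return landed;
}
```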
Signed-off-by: Janusz Lisiecki --- dali/operators/reader/loader/video_loader.cc | 22 ++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/dali/operators/reader/loader/video_loader.cc b/dali/operators/reader/loader/video_loader.cc index 292dfaa402..a9d280fb75 100644 --- a/dali/operators/reader/loader/video_loader.cc +++ b/dali/operators/reader/loader/video_loader.cc @@ -564,6 +564,8 @@ void VideoLoader::read_file() { bool is_first_frame = true; int last_key_frame = -1; + int previous_last_key_frame = -1; + bool previous_last_key_frame_updated = false; bool key = false; bool seek_must_succeed = false; // how many key frames following the last requested frames we saw so far @@ -607,11 +609,19 @@ void VideoLoader::read_file() { "already 0"); } + if (previous_last_key_frame < 0) { + previous_last_key_frame = last_key_frame - 1; + previous_last_key_frame_updated = true; + } + if (previous_last_key_frame_updated == false) { + --previous_last_key_frame; + } LOG_LINE << "Decoding not started, seek to preceding key frame, " - << "current frame " << frame - << ", last key frame " << last_key_frame - << ", is_key " << key << std::endl; - seek(file, last_key_frame - 1); + << "current frame " << frame + << ", look for a key frame before " << previous_last_key_frame + << ", is_key " << key << std::endl; + seek(file, previous_last_key_frame); + previous_last_key_frame_updated = false; frames_send = 0; last_key_frame = -1; continue; @@ -619,6 +629,10 @@ void VideoLoader::read_file() { if (key) { last_key_frame = frame; + if (frame < previous_last_key_frame) { + previous_last_key_frame = frame; + previous_last_key_frame_updated = true; + } } int pkt_frames = 1; From 131c99b1abd9b410d52ce85db47ff8690e2f5022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Mon, 14 Oct 2024 14:38:27 +0200 Subject: [PATCH 17/29] Fix handling of tasks with zero outputs. (#5674) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tasking: * zero-output tasks have now special treatment * a scalar void value is used when there are zero outputs Exec2: * Add more robust detection of sink operators. * Add a unit test with sink operator. 
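[In other words, a task may now be declared with zero outputs and return an empty tuple; a consumer can still order itself after it, and `Value()` still rethrows the producer's exception. A minimal sketch (header path and namespace are assumptions; the authoritative usage is in the tests below):]

```cpp
#include <tuple>
#include "dali/core/exec/tasking.h"  // assumed aggregate header

void ZeroOutputTaskSketch() {
  using namespace dali::tasking;  // assumed namespace
  Executor ex(4);
  ex.Start();
  // Declared with 0 outputs; an empty tuple now maps to a single internal
  // "void" result instead of tripping the result-count check.
  auto producer = Task::Create(0, []() { return std::tuple<>(); });
  auto consumer = Task::Create([]() {});
  consumer->Succeed(producer);     // sequencing-only dependency, no data flows
  ex.AddSilentTask(producer);
  auto fut = ex.AddTask(consumer);
  fut.Value();                     // rethrows if the producer threw
}
```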
--------
Signed-off-by: Michał Zientkiewicz
---
 dali/core/exec/tasking_test.cc                | 46 +++++++++++
 .../executor/executor2/exec2_ops_for_test.cu  |  7 ++
 .../executor/executor2/exec2_ops_for_test.h   | 36 +++++++++++
 .../pipeline/executor/executor2/exec2_test.cc | 21 ++++++
 dali/pipeline/executor/executor2/exec2_test.h | 15 +++++
 dali/pipeline/graph/op_graph2.cc              |  2 +-
 include/dali/core/exec/tasking/task.h         | 64 +++++++++++++------
 7 files changed, 172 insertions(+), 19 deletions(-)

diff --git a/dali/core/exec/tasking_test.cc b/dali/core/exec/tasking_test.cc
index 1a846cfc87..dc027b99b1 100644
--- a/dali/core/exec/tasking_test.cc
+++ b/dali/core/exec/tasking_test.cc
@@ -336,6 +336,52 @@ TEST(TaskingTest, MultiOutputTuple) {
   EXPECT_EQ(ret, 1 + 3 + 42 + 5 + 10);
 }
 
+TEST(TaskingTest, ZeroResults) {
+  Executor ex(4);
+  ex.Start();
+  auto producer1 = Task::Create(0, []() {
+    return std::tuple<>();
+  });
+  auto producer2 = Task::Create(0, []() {
+    return std::vector<int>();
+  });
+
+  auto consumer = Task::Create([]() { });
+  consumer->Succeed(producer1);
+  consumer->Succeed(producer2);
+
+  ex.AddSilentTask(producer1);
+  ex.AddSilentTask(producer2);
+  auto fut = ex.AddTask(consumer);
+  EXPECT_NO_THROW(fut.Value());
+}
+
+TEST(TaskingTest, ZeroResultsThrow) {
+  Executor ex(4);
+  ex.Start();
+  auto producer1 = Task::Create(0, []() {
+    return std::tuple<>();
+  });
+  class MyError {};
+  auto producer2 = Task::Create(0, []() {
+    throw MyError();
+    return std::tuple<>();
+  });
+
+  auto consumer = Task::Create([](Task *t) {
+    t->GetInputValue(0);
+    t->GetInputValue(1);
+  });
+  consumer->Subscribe(producer1);
+  consumer->Subscribe(producer2);
+
+  ex.AddSilentTask(producer1);
+  ex.AddSilentTask(producer2);
+  auto fut = ex.AddTask(consumer);
+  EXPECT_THROW(fut.Value(), MyError);
+}
+
+
 namespace {
 
 template
diff --git a/dali/pipeline/executor/executor2/exec2_ops_for_test.cu b/dali/pipeline/executor/executor2/exec2_ops_for_test.cu
index 65a327f753..7b65a69a0d 100644
--- a/dali/pipeline/executor/executor2/exec2_ops_for_test.cu
+++ b/dali/pipeline/executor/executor2/exec2_ops_for_test.cu
@@ -37,6 +37,13 @@ DALI_SCHEMA(Exec2Counter)
 
 DALI_REGISTER_OPERATOR(Exec2Counter, exec2::test::CounterOp, CPU);
 
+DALI_SCHEMA(Exec2Sink)
+  .NumInput(0, 99)
+  .NumOutput(0)
+  .NoPrune();
+
+DALI_REGISTER_OPERATOR(Exec2Sink, exec2::test::SinkOp, CPU);
+
 namespace exec2 {
 namespace test {
diff --git a/dali/pipeline/executor/executor2/exec2_ops_for_test.h b/dali/pipeline/executor/executor2/exec2_ops_for_test.h
index e9d7a3b398..17e2c4b772 100644
--- a/dali/pipeline/executor/executor2/exec2_ops_for_test.h
+++ b/dali/pipeline/executor/executor2/exec2_ops_for_test.h
@@ -141,6 +141,42 @@ class CounterOp : public Operator<CPUBackend> {
   int counter = 0;
 };
 
+constexpr char kSinkOpName[] = "Exec2Sink";
+
+/** A non-prunable operator without outputs.
+ *
+ * This accumulates the sum of input values in a member variable.
+ */ +class SinkOp : public Operator { + public: + explicit SinkOp(const OpSpec &spec) : Operator(spec) { + } + + bool SetupImpl(std::vector &outs, const Workspace &ws) override { + outs.clear(); + return true; + } + + void RunImpl(Workspace &ws) override { + for (int ii = 0; ii < ws.NumInput(); ii++) { + auto &input = ws.Input(ii); + int N = input.num_samples(); + for (int i = 0; i < N; i++) { + const auto &sample = input[i]; + const int *data = sample.data(); + int64_t vol = sample.shape().num_elements(); + for (int64_t j = 0; j < vol; j++) { + acc += data[j]; + } + } + } + } + + bool CanInferOutputs() const override { return true; } + + int64_t acc = 0; +}; + } // namespace test } // namespace exec2 } // namespace dali diff --git a/dali/pipeline/executor/executor2/exec2_test.cc b/dali/pipeline/executor/executor2/exec2_test.cc index 8e5ebde8b9..753034c2e8 100644 --- a/dali/pipeline/executor/executor2/exec2_test.cc +++ b/dali/pipeline/executor/executor2/exec2_test.cc @@ -125,6 +125,27 @@ TEST_P(Exec2Test, Graph2_CPU2GPU) { } } +TEST_P(Exec2Test, Graph3_SinkOnly) { + Executor2 exec(config_); + graph::OpGraph graph = GetTestGraph3(); + exec.Build(graph); + for (int i = 0; i < 10; i++) { + exec.Run(); + } + Workspace ws; + int64_t acc = 0; + int bs = config_.max_batch_size; + for (int i = 0; i < 10; i++) { + ws.Clear(); + exec.Outputs(&ws); + auto *sink = dynamic_cast(exec.GetOperator("op1")); + ASSERT_NE(sink, nullptr); + int64_t batch_sum = bs * (bs - 1) / 2; + acc += batch_sum; + EXPECT_EQ(sink->acc, acc); + } +} + Executor2::Config MakeCfg(QueueDepthPolicy q, OperatorConcurrency c, StreamPolicy s) { Executor2::Config cfg; diff --git a/dali/pipeline/executor/executor2/exec2_test.h b/dali/pipeline/executor/executor2/exec2_test.h index f60e288893..ac3f462167 100644 --- a/dali/pipeline/executor/executor2/exec2_test.h +++ b/dali/pipeline/executor/executor2/exec2_test.h @@ -154,6 +154,21 @@ inline void CheckTestGraph2Results(const Workspace &ws, int batch_size) { } } +inline auto GetTestGraph3() { + auto spec0 = OpSpec(kTestOpName) + .AddArg("name", "op0") + .AddOutput("op0_0", "cpu") + .AddArg("addend", 0); + auto spec1 = OpSpec(kSinkOpName) + .AddArg("name", "op1") + .AddInput("op0_0", "cpu"); + + graph::OpGraph::Builder b; + b.Add("op0", std::move(AddCommonArgs(spec0, 32, "cpu", 1))); + b.Add("op1", std::move(AddCommonArgs(spec1, 32, "cpu", 1))); + return std::move(b).GetGraph(true); +} + } // namespace test } // namespace exec2 } // namespace dali diff --git a/dali/pipeline/graph/op_graph2.cc b/dali/pipeline/graph/op_graph2.cc index 40e2dcf919..dff1e801d8 100644 --- a/dali/pipeline/graph/op_graph2.cc +++ b/dali/pipeline/graph/op_graph2.cc @@ -30,7 +30,7 @@ OpNode &OpGraph::AddOp(std::string instance_name, OpSpec spec) { std::string device; if (spec.TryGetArgument(device, "device")) type = ParseOpType(device); - bool preserve = spec.GetArgument("preserve"); + bool preserve = spec.GetArgument("preserve") || spec.GetSchemaOrDefault().IsNoPrune(); auto &op_node = tmp.emplace_back(std::move(instance_name), type, std::move(spec)); if (!name2op_.emplace(op_node.instance_name, &op_node).second) { throw std::invalid_argument( diff --git a/include/dali/core/exec/tasking/task.h b/include/dali/core/exec/tasking/task.h index ce4b8cf18a..71779d2bc0 100644 --- a/include/dali/core/exec/tasking/task.h +++ b/include/dali/core/exec/tasking/task.h @@ -87,6 +87,10 @@ class TaskResult { value_ = std::forward(t); } + void SetVoid() { + value_ = void_result(); + } + /** Sets an exception. 
*/ void SetException(std::exception_ptr e) { exception_ = std::move(e); @@ -143,8 +147,9 @@ static constexpr int ScalarResult = -1; class TaskResults : public SmallVector { public: void Init(int num_results = ScalarResult) { - assert(num_results == ScalarResult || num_results > 0); - is_scalar_ = num_results < 0; + assert(num_results == ScalarResult || num_results >= 0); + is_scalar_ = num_results <= 0; + is_empty_ = num_results == 0; resize(std::max(num_results, 1)); for (auto &r : *this) r = std::make_shared(); @@ -153,12 +158,23 @@ class TaskResults : public SmallVector { /** If true, the object represents a single, scalar result. */ bool IsScalar() const noexcept { return is_scalar_; } - /** Returns the scalar return value of a task. */ + bool IsEmpty() const noexcept { return is_empty_; } + + int NumValues() const noexcept { + return size(); + } + + /** Returns the scalar return value of a task. + * + * NOTE: If the task has 0 outputs, you can use Value() to rethrow the exception, if any. + */ template decltype(auto) Value() const { - if (!is_scalar_) + if (!is_scalar_) { throw std::logic_error("Cannot use argumentless Value to get a non-scalar value"); - return Value(0); + } else { + return Value(0); + } } /** Returns one of the return values of a task. @@ -187,16 +203,22 @@ class TaskResults : public SmallVector { /** Returns the SharedTaskResult at the specified index or throws std::out_of_range. */ const SharedTaskResult &GetChecked(int index) const & { - if (index < 0 || static_cast(index) >= size()) + if (index < 0 || index >= NumValues()) throw std::out_of_range( "The result index out of range. Valid range is [0.." + - std::to_string(size() - 1) + "], got: " + + std::to_string(NumValues() - 1) + "], got: " + std::to_string(index)); return (*this)[index]; } + void SetException(std::exception_ptr exception) { + for (auto &r : *this) + r->SetException(exception); + } + private: bool is_scalar_ = true; + bool is_empty_ = false; }; enum class TaskState { @@ -248,7 +270,7 @@ class Task : public CompletionEvent { template void SetResult(F &&f, Args &&...args) { assert(state_ == TaskState::Running); - assert(results_.size() == 1 && results_[0]); + assert(results_.NumValues() == 1 && results_[0]); results_[0]->SetResultOf([&]() { return std::forward(f)(std::forward(args)...); }); } @@ -272,9 +294,9 @@ class Task : public CompletionEvent { std::forward(args)...))>; if constexpr (detail::is_iterable_v) { auto &&results = f(std::forward(args)...); - size_t n = 0; + int n = 0; for (auto &&r : results) { - if (n >= results_.size()) + if (n >= results_.NumValues()) throw std::logic_error("The function provided more results than " "the task was declared to have."); using T = std::remove_reference_t; @@ -282,21 +304,27 @@ class Task : public CompletionEvent { n++; } - if (n < results_.size()) - throw std::logic_error("The function provided fewer results than " - "the task was declared to have."); + if (n < results_.NumValues()) { + if (!(n == 0 && results_.IsEmpty())) + throw std::logic_error("The function provided fewer results than " + "the task was declared to have."); + else + results_[0]->SetVoid(); + } } else if constexpr (detail::is_tuple_v) { // NOLINT - assert(std::tuple_size_v == results_.size() && + int tuple_size = std::tuple_size_v; + assert((tuple_size == results_.NumValues() || (tuple_size == 0 && results_.IsEmpty())) && "Internal error - incorrect tuple size should have been detected earlier."); auto &&results = f(std::forward(args)...); - UnpackResults<0>(results_, 
std::move(results));
+      if (results_.IsEmpty())
+        results_[0]->SetVoid();
+      else
+        UnpackResults<0>(results_, std::move(results));
     } else {
       assert(!"Internal error - the output type should have been rejected earlier.");
     }
   } catch (...) {
-    auto ex = std::current_exception();
-    for (auto &r : results_)
-      r->SetException(ex);
+    results_.SetException(std::current_exception());
   }
 }

From 3db39b15308932afc8d77ed2a82922556e61e41e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Mon, 14 Oct 2024 22:21:38 +0200
Subject: [PATCH 18/29] Add ready event to Tensor and TensorList. (#5673)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add ready event to Tensor and TensorList.

- move SharedEventLease to core and rename to CUDASharedEvent
- add more complete shared_ptr interface to CUDASharedEvent
- add tests for CUDASharedEvent
- add (set_)ready_event to Tensor and TensorList
- minor refactoring in TensorList
- remove OperatorIO::event in favor of TensorList's ready_event in exec2
- Propagate ready_event in As(Reshaped)Tensor.

---------

Signed-off-by: Michał Zientkiewicz
---
 dali/core/cuda_event_pool_test.cc             |  51 ++++++-
 dali/pipeline/data/tensor.h                   |  32 ++++-
 dali/pipeline/data/tensor_list.cc             |  23 ++--
 dali/pipeline/data/tensor_list.h              |  29 +++-
 dali/pipeline/data/tensor_list_test.cc        |  10 +-
 .../pipeline/executor/executor2/exec_graph.cc |   4 +-
 dali/pipeline/executor/executor2/exec_graph.h |  10 +-
 .../executor/executor2/exec_node_task.cc      |  45 +++---
 .../executor/executor2/exec_node_task.h       |   8 +-
 .../executor/executor2/shared_event_lease.h   |  75 ----------
 include/dali/core/cuda_shared_event.h         | 130 ++++++++++++++++++
 11 files changed, 288 insertions(+), 129 deletions(-)
 delete mode 100644 dali/pipeline/executor/executor2/shared_event_lease.h
 create mode 100644 include/dali/core/cuda_shared_event.h

diff --git a/dali/core/cuda_event_pool_test.cc b/dali/core/cuda_event_pool_test.cc
index f08637efc0..1366621e3a 100644
--- a/dali/core/cuda_event_pool_test.cc
+++ b/dali/core/cuda_event_pool_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020, 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
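[The intended use of the new ready event, sketched against the API added in this patch (stream handling and include paths are assumptions, not part of the diff):]

```cpp
#include <utility>
#include "dali/core/cuda_error.h"            // for CUDA_CALL (assumed path)
#include "dali/core/cuda_shared_event.h"
#include "dali/pipeline/data/tensor_list.h"

// Record a pooled, ref-counted event after the producing work and attach it
// to the batch; a consumer on another stream can then wait for readiness
// without owning the event exclusively.
void MarkReady(dali::TensorList<dali::GPUBackend> &tl, cudaStream_t stream) {
  dali::CUDASharedEvent ev = dali::CUDASharedEvent::GetFromPool();
  CUDA_CALL(cudaEventRecord(ev.get(), stream));
  tl.set_ready_event(std::move(ev));  // shared handle; copies bump use_count
}
```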
@@ -20,6 +20,7 @@ #include "dali/core/cuda_error.h" #include "dali/core/cuda_event_pool.h" #include "dali/core/cuda_stream.h" +#include "dali/core/cuda_shared_event.h" namespace dali { namespace test { @@ -58,5 +59,53 @@ TEST(EventPoolTest, PutGet) { t.join(); } +TEST(CUDASharedEventTest, RefCounting) { + int devices = 0; + (void)cudaGetDeviceCount(&devices); + if (devices == 0) { + (void)cudaGetLastError(); // No CUDA devices - we don't care about the error + GTEST_SKIP(); + } + + CUDASharedEvent ev1 = CUDASharedEvent::GetFromPool(); + CUDASharedEvent ev2 = CUDASharedEvent::GetFromPool(); + ASSERT_EQ(ev1, ev1.get()) << "Sanity check failed - object not equal to itself."; + ASSERT_NE(ev1.get(), nullptr) << "Sanity check failed - returned null instead of throwing."; + ASSERT_NE(ev2.get(), nullptr) << "Sanity check failed - returned null instead of throwing."; + ASSERT_NE(ev1, nullptr) << "Sanity check failed - comparison to null broken."; + ASSERT_NE(ev1, ev2) << "Sanity check failed - returned the same object twice."; + + EXPECT_EQ(ev1.use_count(), 1); + EXPECT_EQ(ev2.use_count(), 1); + CUDASharedEvent ev3 = ev1; + EXPECT_EQ(ev1, ev3); + EXPECT_EQ(ev1.use_count(), 2); + EXPECT_EQ(ev3.use_count(), 2); + ev1.reset(); + EXPECT_EQ(ev1.use_count(), 0); + EXPECT_EQ(ev3.use_count(), 1); +} + +TEST(CUDASharedEventTest, ReturnToPool) { + int devices = 0; + (void)cudaGetDeviceCount(&devices); + if (devices == 0) { + (void)cudaGetLastError(); // No CUDA devices - we don't care about the error + GTEST_SKIP(); + } + + CUDAEventPool pool; + + CUDASharedEvent ev1 = CUDASharedEvent::GetFromPool(pool); + EXPECT_NE(ev1, nullptr); + cudaEvent_t orig = ev1.get(); + ev1.reset(); + EXPECT_EQ(ev1, nullptr); + CUDASharedEvent ev2 = CUDASharedEvent::GetFromPool(pool); + EXPECT_EQ(ev2.get(), orig) << "Should have got the sole event from the pool"; + ev1 = CUDASharedEvent::GetFromPool(pool); + EXPECT_NE(ev1, ev2); +} + } // namespace test } // namespace dali diff --git a/dali/pipeline/data/tensor.h b/dali/pipeline/data/tensor.h index 3088b4365c..82b5a8df2c 100644 --- a/dali/pipeline/data/tensor.h +++ b/dali/pipeline/data/tensor.h @@ -24,10 +24,11 @@ #include "dali/core/common.h" #include "dali/core/error_handling.h" -#include "dali/core/util.h" +#include "dali/core/cuda_shared_event.h" #include "dali/core/span.h" #include "dali/core/traits.h" #include "dali/core/tensor_shape.h" +#include "dali/core/util.h" #include "dali/pipeline/data/backend.h" #include "dali/pipeline/data/buffer.h" #include "dali/pipeline/data/meta.h" @@ -208,6 +209,7 @@ class Tensor : public Buffer { // Copy the tensor's meta-data shape_ = t.shape_; meta_ = t.meta_; + ready_ = t.ready_; } /** @@ -227,7 +229,7 @@ class Tensor : public Buffer { */ inline void ShareData(shared_ptr ptr, size_t bytes, bool pinned, const TensorShape<> &shape, DALIDataType type, int device_id, - AccessOrder order = {}) { + AccessOrder order = {}, CUDASharedEvent ready = {}) { Index new_size = volume(shape); DALI_ENFORCE(new_size == 0 || type != DALI_NO_TYPE, "Only empty tensors can be shared without specifying a type."); @@ -246,6 +248,7 @@ class Tensor : public Buffer { size_ = new_size; num_bytes_ = bytes; device_ = device_id; + ready_ = std::move(ready); // If the input pointer stores a non-zero size allocation, mark // that we are sharing our underlying data @@ -276,8 +279,10 @@ class Tensor : public Buffer { * the dependency on the work that is happening on another device. 
   */
   inline void ShareData(void *ptr, size_t bytes, bool pinned, const TensorShape<> &shape,
-                        DALIDataType type, int device_id, AccessOrder order = {}) {
-    ShareData(shared_ptr<void>(ptr, [](void *) {}), bytes, pinned, shape, type, device_id, order);
+                        DALIDataType type, int device_id, AccessOrder order = {},
+                        CUDASharedEvent ready = {}) {
+    ShareData(shared_ptr<void>(ptr, [](void *) {}), bytes, pinned, shape, type,
+              device_id, order, std::move(ready));
   }
 
   /**
@@ -302,8 +307,8 @@ class Tensor : public Buffer<Backend> {
    * the dependency on the work that is happening on another device.
    */
   inline void ShareData(void *ptr, size_t bytes, bool pinned, DALIDataType type, int device_id,
-                        AccessOrder order = {}) {
-    ShareData(ptr, bytes, pinned, { 0 }, type, device_id, order);
+                        AccessOrder order = {}, CUDASharedEvent ready = {}) {
+    ShareData(ptr, bytes, pinned, { 0 }, type, device_id, order, std::move(ready));
   }
 
   inline void Reset(AccessOrder order = {}) {
@@ -449,9 +454,24 @@ class Tensor : public Buffer<Backend> {
     return meta_.ShouldSkipSample();
   }
 
+  /** Returns an optional, shared handle to a CUDA event that marks the readiness of the tensor data.
+   *
+   * This ready event may be shared by multiple tensor lists or tensors. It may also be null.
+   * Typical DALI operators don't need to record or wait for this event.
+   */
+  const CUDASharedEvent &ready_event() const {
+    return ready_;
+  }
+
+  /** Sets the shared event handle that marks the readiness of the tensor data. */
+  void set_ready_event(CUDASharedEvent ready) {
+    ready_ = std::move(ready);
+  }
+
  protected:
   TensorShape<> shape_ = { 0 };
   DALIMeta meta_;
+  CUDASharedEvent ready_;
 
   USE_BUFFER_MEMBERS();
 
   // So TensorList can access data_ of the tensor directly
diff --git a/dali/pipeline/data/tensor_list.cc b/dali/pipeline/data/tensor_list.cc
index 13c33bd809..75646ef5af 100644
--- a/dali/pipeline/data/tensor_list.cc
+++ b/dali/pipeline/data/tensor_list.cc
@@ -193,7 +193,6 @@ template <typename Backend>
 TensorList<Backend> &TensorList<Backend>::operator=(TensorList<Backend> &&other) noexcept {
   if (&other != this) {
     contiguous_buffer_ = std::move(other.contiguous_buffer_);
-    buffer_bkp_ = std::move(other.buffer_bkp_);
     tensors_ = std::move(other.tensors_);
 
     state_ = other.state_;
@@ -759,25 +758,23 @@ void TensorList<Backend>::MakeNoncontiguous() {
 
 template <typename Backend>
 void TensorList<Backend>::DoMakeNoncontiguous() {
-  // We clear the contiguous_buffer_, as we are now non-contiguous.
-  buffer_bkp_ = contiguous_buffer_.get_data_ptr();
-  contiguous_buffer_.reset();
+  auto &contiguous_ptr = contiguous_buffer_.get_data_ptr();
   for (auto &t : tensors_) {
     // If the Tensor was aliasing the contiguous buffer, mark it as not sharing any data.
     // This will allow for the individual buffers to be resized.
     // The downside of this is we may keep the big contiguous buffer until all individual
     // samples are replaced.
-    if (same_managed_object(buffer_bkp_, t.data_)) {
+    if (same_managed_object(contiguous_ptr, t.data_)) {
       t.detach();
     }
   }
+  contiguous_buffer_.reset();
 }
 
 template <typename Backend>
 void TensorList<Backend>::Reset() {
   contiguous_buffer_.reset();
-  buffer_bkp_.reset();
   // TODO(klecki): Is there any benefit to call Reset on all?
   tensors_.clear();
 
@@ -786,8 +783,8 @@ void TensorList<Backend>::Reset() {
   sample_dim_ = -1;
   shape_ = {};
   layout_ = "";
-  // N.B. state_, pinned_, order_ and device_ are not reset here, as they might be previously set
-  // up via the executor - TODO(klecki) - consider if we want to keep this behaviour
+  // N.B. state_, pinned_, order_, device_ and ready_ are not reset here, as they might be
+  // previously set up via the executor - TODO(klecki) - consider if we want to keep this behaviour
 }
 
 
@@ -857,8 +854,6 @@ void TensorList<Backend>::ShareData(const TensorList<Backend> &tl) {
   if (!same_data)
     Reset();
 
-  buffer_bkp_.reset();  // TODO(michalz): perhaps we should copy it from the source, too?
-
   state_ = tl.state_;
   curr_num_tensors_ = tl.curr_num_tensors_;
   type_ = tl.type_;
@@ -868,6 +863,7 @@ void TensorList<Backend>::ShareData(const TensorList<Backend> &tl) {
   pinned_ = tl.pinned_;
   order_ = tl.order_;
   device_ = tl.device_;
+  ready_ = tl.ready_;
 
   if (tl.IsContiguous()) {
     if (!same_data)
@@ -946,7 +942,7 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
   }
 
   result.ShareData(std::move(ptr), capacity(), is_pinned(),
-                   new_shape, type(), device_id(), order());
+                   new_shape, type(), device_id(), order(), ready_);
 
   auto result_layout = GetLayout();
   if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -974,11 +970,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {
 
 template <typename Backend>
 void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                                     const TensorListShape<> &shape, DALIDataType type,
-                                    int device_id, AccessOrder order, const TensorLayout &layout) {
+                                    int device_id, AccessOrder order, const TensorLayout &layout,
+                                    CUDASharedEvent ready) {
   contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned, type,
                                             shape.num_elements(), device_id, order);
-  buffer_bkp_.reset();
   tensors_.clear();
   tensors_.resize(shape.num_samples());
 
@@ -990,6 +986,7 @@ void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pin
   layout_ = layout;
   pinned_ = pinned;
   device_ = device_id;
+  ready_ = ready;
   if (order)
     order_ = order;
   recreate_views();
diff --git a/dali/pipeline/data/tensor_list.h b/dali/pipeline/data/tensor_list.h
index d39332f928..ad7d2e1b8d 100644
--- a/dali/pipeline/data/tensor_list.h
+++ b/dali/pipeline/data/tensor_list.h
@@ -485,7 +485,8 @@ class DLL_PUBLIC TensorList {
    */
   DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
                             const TensorListShape<> &shape, DALIDataType type, int device_id,
-                            AccessOrder order = {}, const TensorLayout &layout = "");
+                            AccessOrder order = {}, const TensorLayout &layout = "",
+                            CUDASharedEvent ready = {});
 
   /**
    * @brief Set other batch as backing memory for this one. Preserves the contiguity status.
@@ -587,6 +588,22 @@ class DLL_PUBLIC TensorList {
    */
   size_t capacity() const noexcept;
 
+  /** Returns an optional, shared handle to a CUDA event that marks the readiness of the data.
+   *
+   * This ready event may be shared by multiple tensor lists or tensors. It may also be null.
+   * Typical DALI operators don't need to record or wait for this event.
+   */
+  const CUDASharedEvent &ready_event() const {
+    return ready_;
+  }
+
+  /** Sets the shared event handle that marks the readiness of the data. */
+  void set_ready_event(CUDASharedEvent ready) {
+    ready_ = std::move(ready);
+  }
+
+
   /**
    * @brief Returns the size in bytes of the underlying data chunks
    * TODO(klecki): Temporary API to be reworked, do not use.
@@ -765,7 +782,6 @@ class DLL_PUBLIC TensorList {
   // Memory backing
   Buffer<Backend> contiguous_buffer_;
-  std::weak_ptr<void> buffer_bkp_;
   // Memory, sample aliases and metadata
   // TODO(klecki): Remove SampleWorkspace (only place where we actually need those Tensor objects)
   // and swap to plain Buffer instead of using actual Tensors.
@@ -774,15 +790,16 @@ class DLL_PUBLIC TensorList {
   // State and metadata that should be uniform regardless of the contiguity state.
// Sample aliases should match the information stored below. State state_; - int curr_num_tensors_; - TypeInfo type_{}; + bool pinned_ = true; + int curr_num_tensors_ = 0; int sample_dim_ = -1; + int device_ = CPU_ONLY_DEVICE_ID; + TypeInfo type_{}; TensorListShape<> shape_; TensorLayout layout_; - bool pinned_ = true; - int device_ = CPU_ONLY_DEVICE_ID; AccessOrder order_ = AccessOrder::host(); + CUDASharedEvent ready_; // So we can access the members of other TensorLists // with different template types diff --git a/dali/pipeline/data/tensor_list_test.cc b/dali/pipeline/data/tensor_list_test.cc index 4cef10721b..b1bda90c19 100644 --- a/dali/pipeline/data/tensor_list_test.cc +++ b/dali/pipeline/data/tensor_list_test.cc @@ -1994,12 +1994,18 @@ TEST_F(TensorListVariableBatchSizeTest, UpdatePropertiesFromSamples) { } TEST(TensorList, ResizeOverheadPerf) { - cudaFree(0); + (void)cudaFree(0); +#ifdef DALI_DEBUG + int niter = 2000; + int warmup = 500; +#else int niter = 20000; + int warmup = 5000; +#endif int total_size = 256 << 10; int nsamples = 1024; auto shape = uniform_list_shape(nsamples, {total_size / nsamples}); - for (int i = 0; i < 5000; i++) { + for (int i = 0; i < warmup; i++) { TensorList tl; tl.set_pinned(false); tl.Resize(shape, DALI_UINT8); diff --git a/dali/pipeline/executor/executor2/exec_graph.cc b/dali/pipeline/executor/executor2/exec_graph.cc index 0db1867644..0366785578 100644 --- a/dali/pipeline/executor/executor2/exec_graph.cc +++ b/dali/pipeline/executor/executor2/exec_graph.cc @@ -143,7 +143,7 @@ std::unique_ptr ExecNode::CreateOpWorkspace() { return ws; } -std::pair, SharedEventLease> +std::pair, CUDASharedEvent> ExecNode::GetWorkspace(WorkspaceParams params) { if (!ws_) { assert(!has_workspace_); @@ -160,7 +160,7 @@ ExecNode::GetWorkspace(WorkspaceParams params) { ApplyWorkspaceParams(*ws_, params); if (ws_->output_order().is_device()) - ws_event_ = SharedEventLease::Get(); + ws_event_ = CUDASharedEvent::GetFromPool(); else ws_event_.reset(); ws_->set_event(ws_event_); diff --git a/dali/pipeline/executor/executor2/exec_graph.h b/dali/pipeline/executor/executor2/exec_graph.h index 1f891f5411..78c710b269 100644 --- a/dali/pipeline/executor/executor2/exec_graph.h +++ b/dali/pipeline/executor/executor2/exec_graph.h @@ -25,7 +25,7 @@ #include #include -#include "dali/pipeline/executor/executor2/shared_event_lease.h" +#include "dali/core/cuda_shared_event.h" #include "dali/pipeline/operator/operator.h" #include "dali/pipeline/workspace/workspace.h" @@ -62,13 +62,13 @@ struct PipelineOutput { PipelineOutput(const PipelineOutput &) { throw std::logic_error("This object is not copyable, but std::any needs it at compile time."); } - PipelineOutput(const Workspace &ws, SharedEventLease event, std::optional device) + PipelineOutput(const Workspace &ws, CUDASharedEvent event, std::optional device) : workspace(ws), event(std::move(event)), device(device) {} /** The payload */ Workspace workspace; /** Owns the event used by the workspace */ - SharedEventLease event; + CUDASharedEvent event; /** The ordinal of the device used by the workspace */ std::optional device; }; @@ -179,7 +179,7 @@ class DLL_PUBLIC ExecNode { * Requesting multiple workspaces is a coding error. * The workspace is updated with the WorkspaceParams supplied to this function. */ - std::pair, SharedEventLease> GetWorkspace(WorkspaceParams params); + std::pair, CUDASharedEvent> GetWorkspace(WorkspaceParams params); /** Puts the workspace back into the node for later reuse. 
* @@ -244,7 +244,7 @@ class DLL_PUBLIC ExecNode { bool has_workspace_ = false; /** The event associated with the workspace */ - SharedEventLease ws_event_; + CUDASharedEvent ws_event_; /** Moves to a new iteration. */ void NextIter() { diff --git a/dali/pipeline/executor/executor2/exec_node_task.cc b/dali/pipeline/executor/executor2/exec_node_task.cc index 99ef627bce..45fa1ed43c 100644 --- a/dali/pipeline/executor/executor2/exec_node_task.cc +++ b/dali/pipeline/executor/executor2/exec_node_task.cc @@ -281,11 +281,13 @@ void OpTask::SetWorkspaceInputs() { const auto &inp = TaskInput(ti); bool is_meta = node_->inputs[i]->metadata; // metadata-only inputs don't need to be synchronized - if (!is_meta && inp.event && inp.order != order) - events.insert(inp.event); + if (!is_meta && inp.event() && inp.order != order) + events.insert(inp.event()); - // metadata-only inputs don't need a proper stream - if (inp.order == order || is_meta) { // use the input directly + bool is_plain_host = std::is_same_v && !inp.data->is_pinned(); + + // metadata-only inputs && non-pinned host inputs don't need a proper stream + if (inp.order == order || is_meta || is_plain_host) { // use the input directly ws_->SetInput(i, inp.data); } else { // create another TL and set its order (and layout, while we're at it) auto tl = std::make_shared>(); @@ -308,8 +310,8 @@ void OpTask::SetWorkspaceInputs() { for (int i = 0; i < ws_->NumArgumentInput(); i++, ti++) { auto &inp = TaskInput(ti); - if (inp.event) - events.insert(inp.event); + if (inp.event()) + events.insert(inp.event()); ws_->SetArgumentInput(i, inp.data); } @@ -367,11 +369,15 @@ OpTask::OpTaskOutputs OpTask::GetWorkspaceOutputs() { // The consumer stream will be properly synchronized in SetWorkspaceInputs. // This is done to facilitate freeing of memory - if we're able to transfer the // object to the consumption stream, it'll be freed in consumption order. 
- if (!ptr->shares_data()) { - if (AccessOrder consumer_order = OutputConsumerStream(o)) - ptr->set_order(consumer_order, false); + if (ptr->is_pinned()) { + if (!ptr->shares_data()) { + if (AccessOrder consumer_order = OutputConsumerStream(o)) { + ptr->set_order(consumer_order, false); + } + } + ptr->set_ready_event(event_); } - ret.push_back(OperatorIO{std::move(ptr), event_, order}); + ret.push_back(OperatorIO{std::move(ptr), order}); } else { assert(ws_->OutputIsType(o)); auto ptr = ws_->OutputPtr(o); @@ -381,7 +387,8 @@ OpTask::OpTaskOutputs OpTask::GetWorkspaceOutputs() { ptr->set_order(consumer_order, false); } } - ret.push_back(OperatorIO{ws_->OutputPtr(o), event_, order}); + ptr->set_ready_event(event_); + ret.push_back(OperatorIO{ws_->OutputPtr(o), order}); } } @@ -424,14 +431,14 @@ PipelineOutput OutputTask::Run() { for (int o = 0; o < ws_->NumOutput(); o++) { if (ws_->OutputIsType(o)) { auto &inp = TaskInput(o); - if (inp.event) - events.insert(inp.event); + if (inp.event()) + events.insert(inp.event()); ws_->SetOutput(o, inp.data); } else { assert(ws_->OutputIsType(o)); auto &inp = TaskInput(o); - if (inp.event) - events.insert(inp.event); + if (inp.event()) + events.insert(inp.event()); ws_->SetOutput(o, inp.data); } } @@ -442,10 +449,16 @@ PipelineOutput OutputTask::Run() { for (int o = 0; o < ws_->NumOutput(); o++) { if (ws_->OutputIsType(o)) { auto &out = ws_->Output(o); - out.set_order(ws_->output_order(), false); + if (out.is_pinned()) { + out.set_order(ws_->output_order(), false); + out.set_ready_event(event_); + } else { + assert(out.order() == AccessOrder::host()); + } } else { auto &out = ws_->Output(o); out.set_order(ws_->output_order(), false); + out.set_ready_event(event_); } } diff --git a/dali/pipeline/executor/executor2/exec_node_task.h b/dali/pipeline/executor/executor2/exec_node_task.h index fca9188b9a..33aae4037c 100644 --- a/dali/pipeline/executor/executor2/exec_node_task.h +++ b/dali/pipeline/executor/executor2/exec_node_task.h @@ -19,7 +19,7 @@ #include #include "dali/core/exec/tasking.h" #include "dali/core/call_at_exit.h" -#include "dali/pipeline/executor/executor2/shared_event_lease.h" +#include "dali/core/cuda_shared_event.h" #include "dali/pipeline/executor/executor2/exec_graph.h" namespace dali { @@ -57,13 +57,15 @@ class ExecNodeTask { ExecNode *node_ = nullptr; WorkspaceParams ws_params_{}; std::unique_ptr ws_ = nullptr; - SharedEventLease event_; + CUDASharedEvent event_; template struct OperatorIO { std::shared_ptr> data; - SharedEventLease event; AccessOrder order = AccessOrder::host(); + cudaEvent_t event() const { + return data ? data->ready_event().get() : nullptr; + } }; template diff --git a/dali/pipeline/executor/executor2/shared_event_lease.h b/dali/pipeline/executor/executor2/shared_event_lease.h deleted file mode 100644 index d953cb681e..0000000000 --- a/dali/pipeline/executor/executor2/shared_event_lease.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef DALI_PIPELINE_EXECUTOR_EXECUTOR2_SHARED_EVENT_LEASE_H_ -#define DALI_PIPELINE_EXECUTOR_EXECUTOR2_SHARED_EVENT_LEASE_H_ - -#include -#include -#include -#include "dali/core/cuda_event_pool.h" -#include "dali/core/cuda_error.h" - -namespace dali { -namespace exec2 { - -class SharedEventLease { - public: - SharedEventLease() = default; - explicit SharedEventLease(CUDAEvent &&event, int device_id = -1) { - if (device_id < 0) - CUDA_CALL(cudaGetDevice(&device_id)); - - event_ = std::shared_ptr(event.get(), [device_id](void *e) { - CUDAEventPool::instance().Put(CUDAEvent(static_cast(e)), device_id); - }); - - event.release(); - } - - static SharedEventLease Get(int device_id = -1) { - if (device_id < 0) - CUDA_CALL(cudaGetDevice(&device_id)); - CUDAEvent event = CUDAEventPool::instance().Get(device_id); - return SharedEventLease(std::move(event), device_id); - } - - void reset() { - event_.reset(); - } - - cudaEvent_t get() const { - return static_cast(event_.get()); - } - - explicit operator bool() const { - return event_ != nullptr; - } - - operator cudaEvent_t() const { - return get(); - } - - private: - // Hack: use shared_ptr to store a CUDA event - shared_ptr doesn't care whether the pointer - // it manages is a real pointer or something else as long as: - // - null value is equivalent to nullptr - // - the provided deleter can free the object. - std::shared_ptr event_; -}; - -} // namespace exec2 -} // namespace dali - -#endif // DALI_PIPELINE_EXECUTOR_EXECUTOR2_SHARED_EVENT_LEASE_H_ diff --git a/include/dali/core/cuda_shared_event.h b/include/dali/core/cuda_shared_event.h new file mode 100644 index 0000000000..d86a83d5b5 --- /dev/null +++ b/include/dali/core/cuda_shared_event.h @@ -0,0 +1,130 @@ +// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DALI_CORE_CUDA_SHARED_EVENT_H_ +#define DALI_CORE_CUDA_SHARED_EVENT_H_ + +#include +#include +#include +#include "dali/core/cuda_event_pool.h" +#include "dali/core/cuda_error.h" + +namespace dali { + +/** A reference counting wrapper around cudaEvent_t. + * + * This class wraps a cudaEvent_t in a shared_ptr-like interface. + * Internally it uses a std::shared_ptr to manage the handle. + * + * The class provides convenience functions for getting the event from CUDAEventPool. 
+ */
+class CUDASharedEvent {
+ public:
+  CUDASharedEvent() = default;
+
+  template <typename EventDeleter>
+  CUDASharedEvent(cudaEvent_t event, EventDeleter &&deleter)
+      : event_{
+            event,
+            [del = std::move(deleter)](void *handle) mutable {
+              del(static_cast<cudaEvent_t>(handle));
+            }} {}
+
+  explicit CUDASharedEvent(CUDAEvent event)
+      : CUDASharedEvent(event.get(), CUDAEvent::DestroyHandle) {
+    (void)event.release();
+  }
+
+  template <typename EventDeleter>
+  explicit CUDASharedEvent(CUDAEvent event, EventDeleter &&deleter)
+      : CUDASharedEvent(event.get(), std::forward<EventDeleter>(deleter)) {
+    (void)event.release();
+  }
+
+  static CUDASharedEvent GetFromPool(CUDAEventPool &pool, int device_id = -1) {
+    if (device_id < 0)
+      CUDA_CALL(cudaGetDevice(&device_id));
+    CUDAEvent &&event = pool.Get(device_id);
+    return CUDASharedEvent(
+        std::move(event),
+        [device_id, owner = &pool](void *e) {
+          owner->Put(CUDAEvent(static_cast<cudaEvent_t>(e)), device_id);
+        });
+  }
+
+  static CUDASharedEvent GetFromPool(int device_id = -1) {
+    return GetFromPool(CUDAEventPool::instance(), device_id);
+  }
+
+  static CUDASharedEvent Create(int device_id = -1) {
+    return CUDASharedEvent(CUDAEvent::Create(device_id));
+  }
+
+  void reset() noexcept {
+    event_.reset();
+  }
+
+  cudaEvent_t get() const noexcept {
+    return static_cast<cudaEvent_t>(event_.get());
+  }
+
+  long use_count() const noexcept {  // NOLINT(runtime/int)
+    return event_.use_count();
+  }
+
+  explicit operator bool() const noexcept {
+    return static_cast<bool>(event_);
+  }
+
+  operator cudaEvent_t() const noexcept {
+    return get();
+  }
+
+  bool operator==(const CUDASharedEvent &other) const noexcept {
+    return get() == other.get();
+  }
+
+  bool operator!=(const CUDASharedEvent &other) const noexcept {
+    return get() != other.get();
+  }
+
+  bool operator==(cudaEvent_t event) const noexcept {
+    return get() == event;
+  }
+
+  bool operator!=(cudaEvent_t event) const noexcept {
+    return get() != event;
+  }
+
+
+  bool operator==(std::nullptr_t) const noexcept {
+    return get() == nullptr;
+  }
+
+  bool operator!=(std::nullptr_t) const noexcept {
+    return get() != nullptr;
+  }
+
+ private:
+  // Hack: use shared_ptr to store a CUDA event - shared_ptr doesn't care whether the pointer
+  // it manages is a real pointer or something else as long as:
+  // - null value is equivalent to nullptr
+  // - the provided deleter can free the object.
+  std::shared_ptr<void> event_;
+};
+
+}  // namespace dali
+
+#endif  // DALI_CORE_CUDA_SHARED_EVENT_H_

From 1b51e15de972c4bd6058283e9b9e1bb04f147e25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Thu, 17 Oct 2024 10:07:15 +0200
Subject: [PATCH 19/29] Refactor operator output contiguity handling (#5677)

Changes in operators:
- Rename CanInferOutputs to HasContiguousOutputs, because that's how it's used
- Change the default value to true
- Add (far fewer) implementations returning false
- Add checks when updating a TL from samples (used in SampleWorkspace)
- Add opportunistic coalescing mode to MakeContiguous

Changes in executor(s):
- Set all MakeContiguous nodes to opportunistic in the new executor
- Remove the return-value check between CanInferOutputs and Setup in the old executor

Changes in TensorList:
- Rename unsafe_raw_data to contiguous_raw_data
- Use IsContiguousInMemory instead of IsContiguous when obtaining a contiguous buffer
  (as done in AsReshapedTensor)
- Simplify getting a shared_ptr for a sample

Other:
- Adjust custom operator examples.
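
For illustration, a minimal sketch of a custom operator under the new contract
(the operator below is hypothetical and not part of this patch; it only follows
the override pattern visible in the diffs that follow):

    class MySamplewiseOp : public StatelessOperator<CPUBackend> {
     public:
      explicit MySamplewiseOp(const OpSpec &spec)
          : StatelessOperator<CPUBackend>(spec) {}

     protected:
      // The outputs are produced sample-by-sample, so they are not contiguous.
      // With the new default of HasContiguousOutputs() == true, only such
      // operators need to override this method.
      bool HasContiguousOutputs() const override {
        return false;
      }

      // Returning false tells the executor not to pre-allocate the outputs.
      bool SetupImpl(std::vector<OutputDesc> &output_desc,
                     const Workspace &ws) override {
        return false;
      }

      void RunImpl(Workspace &ws) override {
        // per-sample processing goes here
      }
    };

Operators that keep the default need no change; readers that produce contiguous
batches (e.g. FileReader, NumpyReader) now override HasContiguousOutputs() to
return true, since DataReader's new default returns false.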
--------- Signed-off-by: Michal Zientkiewicz --- dali/benchmark/operator_bench.h | 3 +- .../audio/mel_scale/mel_filter_bank.h | 1 - dali/operators/audio/mfcc/mfcc.h | 1 - dali/operators/audio/nonsilence_op.h | 4 -- dali/operators/audio/preemphasis_filter_op.h | 4 -- dali/operators/audio/resample.h | 4 -- dali/operators/bbox/bb_flip.h | 4 -- dali/operators/bbox/bbox_paste.h | 4 ++ dali/operators/debug/dump_image.h | 4 ++ .../decoder/audio/audio_decoder_op.h | 4 -- dali/operators/decoder/host/host_decoder.h | 4 ++ dali/operators/decoder/inflate/inflate.h | 4 -- .../decoder/peek_shape/peek_image_shape.h | 3 -- .../decoder/video/video_decoder_cpu.h | 4 -- .../decoder/video/video_decoder_mixed.h | 4 -- dali/operators/generic/cast.h | 4 -- dali/operators/generic/constant.h | 7 ++-- dali/operators/generic/constant_value.h | 1 - dali/operators/generic/erase/erase.h | 4 -- dali/operators/generic/flip.h | 4 -- dali/operators/generic/join.h | 1 - dali/operators/generic/lookup_table.h | 4 -- dali/operators/generic/one_hot.h | 4 -- dali/operators/generic/pad.h | 4 -- dali/operators/generic/permute_batch.h | 4 -- dali/operators/generic/reduce/reduce.h | 1 - .../generic/reduce/reduce_with_mean_input.h | 1 - dali/operators/generic/reshape.h | 5 +-- dali/operators/generic/resize/tensor_resize.h | 1 - dali/operators/generic/roi_random_crop.cc | 1 - dali/operators/generic/shapes.h | 1 - dali/operators/generic/slice/slice_base.h | 4 -- dali/operators/generic/slice/subscript.h | 1 - dali/operators/generic/transpose/transpose.h | 4 -- .../generic/transpose/transpose_gpu.cc | 4 -- .../affine_transforms/combine_transforms.cc | 1 - .../affine_transforms/transform_base_op.h | 1 - dali/operators/geometry/coord_flip.h | 4 -- dali/operators/geometry/coord_transform.h | 1 - .../image/color/brightness_contrast.h | 4 -- .../image/color/color_space_conversion.h | 2 - dali/operators/image/color/color_twist.h | 4 -- dali/operators/image/color/debayer.h | 4 -- dali/operators/image/color/equalize.h | 4 -- dali/operators/image/convolution/filter.h | 4 -- .../image/convolution/gaussian_blur.h | 4 -- dali/operators/image/convolution/laplacian.h | 4 -- .../image/crop/crop_mirror_normalize.h | 4 -- .../image/crop/new_crop_mirror_normalize.cu | 4 -- .../image/crop/random_crop_generator.h | 1 - .../jpeg_compression_distortion_op.h | 4 -- dali/operators/image/filter/median_blur.cc | 4 -- dali/operators/image/mask/grid_mask.h | 1 - dali/operators/image/morphology/morphology.h | 4 -- dali/operators/image/paste/multipaste.h | 4 -- .../image/remap/cvcuda/warp_perspective.cc | 4 -- .../remap/displacement_filter_impl_cpu.h | 4 -- .../remap/displacement_filter_impl_gpu.cuh | 4 -- dali/operators/image/remap/remap.h | 5 --- dali/operators/image/remap/warp.h | 4 -- .../image/resize/experimental/resize.h | 4 -- .../image/resize/random_resized_crop.h | 1 - dali/operators/image/resize/resize.h | 1 - .../image/resize/resize_crop_mirror.h | 1 - dali/operators/imgcodec/image_decoder.h | 4 -- dali/operators/imgcodec/peek_image_shape.cc | 4 -- dali/operators/imgcodec/peek_image_shape.h | 2 - dali/operators/input/video_input.h | 4 -- dali/operators/io/file/file_read.cc | 1 - dali/operators/math/expressions/arithmetic.h | 4 -- dali/operators/math/normalize/normalize.h | 1 - dali/operators/numba_function/numba_func.h | 2 - .../python_function/dltensor_function.h | 4 ++ dali/operators/python_function/jax_function.h | 4 ++ dali/operators/random/batch_permutation.h | 1 - dali/operators/random/rng_base.h | 4 -- dali/operators/random/uniform.h | 4 -- 
dali/operators/reader/file_reader_op.h | 2 +- dali/operators/reader/fits_reader_op.h | 2 +- dali/operators/reader/numpy_reader_op.h | 2 +- dali/operators/reader/reader_op.h | 4 ++ dali/operators/reader/tfrecord_reader_op.h | 2 +- .../reader/video_reader_decoder_gpu_op.h | 2 +- dali/operators/reader/video_reader_op.cc | 4 +- dali/operators/reader/webdataset_reader_op.h | 2 +- .../segmentation/random_mask_pixel.cc | 1 - .../segmentation/random_object_bbox.h | 4 -- dali/operators/segmentation/select_masks.h | 4 -- dali/operators/sequence/element_extract.h | 4 -- .../sequence/optical_flow/optical_flow.h | 4 -- dali/operators/sequence/per_frame.h | 5 +-- dali/operators/sequence/sequence_rearrange.h | 4 -- .../operators/signal/decibel/to_decibels_op.h | 1 - dali/operators/signal/fft/power_spectrum.h | 1 - dali/operators/signal/fft/spectrogram.h | 2 - dali/operators/ssd/box_encoder.h | 4 ++ dali/operators/ssd/random_crop.h | 4 ++ dali/operators/util/get_property.h | 2 +- dali/pipeline/data/copy_to_external.h | 7 ++-- dali/pipeline/data/tensor_list.cc | 28 ++++++++++++-- dali/pipeline/data/tensor_list.h | 34 ++++++++++------- dali/pipeline/data/tensor_list_test.cc | 26 ++++++------- .../executor/executor2/exec2_ops_for_test.h | 4 -- .../executor/executor2/exec_graph_analysis.cc | 9 +++++ .../executor2/stream_assignment_test.cc | 5 +++ dali/pipeline/executor/executor_impl.cc | 20 +++++----- dali/pipeline/executor/executor_impl.h | 4 +- dali/pipeline/executor/lowered_graph.cc | 2 +- .../builtin/conditional/logical_not.h | 8 ++-- .../operator/builtin/conditional/merge.h | 2 +- .../operator/builtin/conditional/split.h | 2 +- .../conditional/validate_logical_expr.h | 6 ++- dali/pipeline/operator/builtin/copy.h | 4 -- .../operator/builtin/external_source.h | 4 +- .../operator/builtin/input_operator.h | 4 +- .../operator/builtin/make_contiguous.cc | 14 ++++++- .../operator/builtin/make_contiguous.cu | 4 +- .../operator/builtin/make_contiguous.h | 38 +++++++++++++++---- dali/pipeline/operator/eager_operator.h | 2 +- dali/pipeline/operator/false_gpu_operator.h | 2 +- dali/pipeline/operator/op_spec_test.cc | 8 ---- dali/pipeline/operator/operator.h | 22 ++++++++--- dali/pipeline/operator/operator_test.cc | 5 +++ dali/pipeline/pipeline_test.cc | 20 ++++++++++ dali/python/backend_impl.cc | 8 ++-- dali/test/operators/copy.h | 4 ++ dali/test/operators/dummy_op.h | 4 ++ dali/test/operators/exception.h | 4 ++ dali/test/operators/passthrough.h | 4 ++ dali/test/operators/passthrough_with_trace.h | 4 ++ dali/test/operators/string_msg_helper.h | 4 -- dali/test/plugins/dummy/dummy.h | 4 -- .../create_a_custom_operator.ipynb | 8 +--- .../custom_operator/customdummy/dummy.h | 4 -- .../naive_histogram/naive_histogram.h | 7 +--- .../pkg_src/src/decoder/video_decoder_mixed.h | 4 -- 136 files changed, 268 insertions(+), 377 deletions(-) diff --git a/dali/benchmark/operator_bench.h b/dali/benchmark/operator_bench.h index 923c6b7a85..a7a002897d 100644 --- a/dali/benchmark/operator_bench.h +++ b/dali/benchmark/operator_bench.h @@ -29,8 +29,7 @@ class OperatorBench : public DALIBenchmark { template void Setup(OperatorPtr &op_ptr, const OpSpec &spec, Workspace &ws, int batch_size) { std::vector outputs; - bool can_infer_outs = op_ptr->CanInferOutputs(); - if (op_ptr->Setup(outputs, ws) && can_infer_outs) { + if (op_ptr->Setup(outputs, ws)) { int num_out = outputs.size(); for (int i = 0; i < num_out; i++) { auto data_out = std::make_shared(batch_size); diff --git a/dali/operators/audio/mel_scale/mel_filter_bank.h 
b/dali/operators/audio/mel_scale/mel_filter_bank.h index de364b9702..2434aa189a 100644 --- a/dali/operators/audio/mel_scale/mel_filter_bank.h +++ b/dali/operators/audio/mel_scale/mel_filter_bank.h @@ -69,7 +69,6 @@ class MelFilterBank : public StatelessOperator { } protected: - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/audio/mfcc/mfcc.h b/dali/operators/audio/mfcc/mfcc.h index 019febba74..07815c49b1 100644 --- a/dali/operators/audio/mfcc/mfcc.h +++ b/dali/operators/audio/mfcc/mfcc.h @@ -79,7 +79,6 @@ class MFCC : public StatelessOperator { : StatelessOperator(spec) {} protected: - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/audio/nonsilence_op.h b/dali/operators/audio/nonsilence_op.h index 463191e3c3..8b172c84b3 100644 --- a/dali/operators/audio/nonsilence_op.h +++ b/dali/operators/audio/nonsilence_op.h @@ -142,10 +142,6 @@ class NonsilenceOperator : public StatelessOperator { StatelessOperator(spec) {} - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { AcquireArgs(spec_, ws); TensorShape<> scalar_shape = {}; diff --git a/dali/operators/audio/preemphasis_filter_op.h b/dali/operators/audio/preemphasis_filter_op.h index 471e83473f..8ced6461a4 100644 --- a/dali/operators/audio/preemphasis_filter_op.h +++ b/dali/operators/audio/preemphasis_filter_op.h @@ -62,10 +62,6 @@ class PreemphasisFilter : public StatelessOperator { ~PreemphasisFilter() override = default; DISABLE_COPY_MOVE_ASSIGN(PreemphasisFilter); - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); diff --git a/dali/operators/audio/resample.h b/dali/operators/audio/resample.h index e64550c84d..a996ea57c3 100644 --- a/dali/operators/audio/resample.h +++ b/dali/operators/audio/resample.h @@ -50,10 +50,6 @@ class ResampleBase : public StatelessOperator { } } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &outputs, const Workspace &ws) override { outputs.resize(1); if (dtype_ == DALI_NO_TYPE) diff --git a/dali/operators/bbox/bb_flip.h b/dali/operators/bbox/bb_flip.h index 1a51b2671d..8fa7634759 100644 --- a/dali/operators/bbox/bb_flip.h +++ b/dali/operators/bbox/bb_flip.h @@ -38,10 +38,6 @@ class BbFlip : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(BbFlip); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_descs, const Workspace &ws) override { const auto &input = ws.Input(0); DALI_ENFORCE(input.type() == DALI_FLOAT, "Bounding box in wrong format"); diff --git a/dali/operators/bbox/bbox_paste.h b/dali/operators/bbox/bbox_paste.h index af66c41c8b..f7a4cf736a 100644 --- a/dali/operators/bbox/bbox_paste.h +++ b/dali/operators/bbox/bbox_paste.h @@ -36,6 +36,10 @@ class BBoxPaste : public StatelessOperator { protected: bool use_ltrb_ = false; + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/debug/dump_image.h b/dali/operators/debug/dump_image.h index e80abdfb71..2a928313be 100644 --- 
a/dali/operators/debug/dump_image.h +++ b/dali/operators/debug/dump_image.h @@ -38,6 +38,10 @@ class DumpImage : public StatelessOperator { inline ~DumpImage() override = default; protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/decoder/audio/audio_decoder_op.h b/dali/operators/decoder/audio/audio_decoder_op.h index 5d78ae8eac..910630920e 100644 --- a/dali/operators/decoder/audio/audio_decoder_op.h +++ b/dali/operators/decoder/audio/audio_decoder_op.h @@ -59,10 +59,6 @@ class AudioDecoderCpu : public StatelessOperator { void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { - return true; - } - private: template diff --git a/dali/operators/decoder/host/host_decoder.h b/dali/operators/decoder/host/host_decoder.h index 621f78f2a1..f8464ffff7 100644 --- a/dali/operators/decoder/host/host_decoder.h +++ b/dali/operators/decoder/host/host_decoder.h @@ -36,6 +36,10 @@ class HostDecoder : public StatelessOperator { inline ~HostDecoder() override = default; DISABLE_COPY_MOVE_ASSIGN(HostDecoder); + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/decoder/inflate/inflate.h b/dali/operators/decoder/inflate/inflate.h index 33512474c7..d977dfa3d7 100644 --- a/dali/operators/decoder/inflate/inflate.h +++ b/dali/operators/decoder/inflate/inflate.h @@ -76,10 +76,6 @@ class Inflate : public StatelessOperator { : StatelessOperator(spec), alg_{inflate::parse_inflate_alg(spec.GetArgument(inflate::algArgName))} {} - bool CanInferOutputs() const override { - return true; - } - protected: void SetupOpImpl(); diff --git a/dali/operators/decoder/peek_shape/peek_image_shape.h b/dali/operators/decoder/peek_shape/peek_image_shape.h index db33092f6e..66998d0f34 100644 --- a/dali/operators/decoder/peek_shape/peek_image_shape.h +++ b/dali/operators/decoder/peek_shape/peek_image_shape.h @@ -50,9 +50,6 @@ class PeekImageShape : public StatelessOperator { } } } - bool CanInferOutputs() const override { - return true; - } protected: bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { diff --git a/dali/operators/decoder/video/video_decoder_cpu.h b/dali/operators/decoder/video/video_decoder_cpu.h index a75f535541..08a25bd7fa 100644 --- a/dali/operators/decoder/video/video_decoder_cpu.h +++ b/dali/operators/decoder/video/video_decoder_cpu.h @@ -31,10 +31,6 @@ class VideoDecoderCpu explicit VideoDecoderCpu(const OpSpec &spec) : Operator(spec) {} - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; diff --git a/dali/operators/decoder/video/video_decoder_mixed.h b/dali/operators/decoder/video/video_decoder_mixed.h index e640c405f3..1c70462cf5 100644 --- a/dali/operators/decoder/video/video_decoder_mixed.h +++ b/dali/operators/decoder/video/video_decoder_mixed.h @@ -38,10 +38,6 @@ class VideoDecoderMixed "mixed video decoder") {} - bool CanInferOutputs() const override { - return true; - } - void RunImpl(Workspace &ws) override; diff --git a/dali/operators/generic/cast.h b/dali/operators/generic/cast.h index 5c2063dbd0..6fd955bf12 100644 --- a/dali/operators/generic/cast.h +++ b/dali/operators/generic/cast.h @@ -48,10 +48,6 @@ class Cast : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(Cast); protected: - bool 
CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); DALIDataType out_type = is_cast_like_ ? ws.GetInputDataType(1) : dtype_arg_; diff --git a/dali/operators/generic/constant.h b/dali/operators/generic/constant.h index 3b6f86d77e..613c132b53 100644 --- a/dali/operators/generic/constant.h +++ b/dali/operators/generic/constant.h @@ -72,9 +72,8 @@ class Constant : public StatelessOperator { } } - bool CanInferOutputs() const override { - // Return false, because we specifically don't want the executor to allocate - // the storage for the output - even though we can infer the shape. + bool HasContiguousOutputs() const override { + // The output is not contiguous, because we repeat one sample. return false; } @@ -87,7 +86,7 @@ class Constant : public StatelessOperator { output_shape_ = max_output_shape_; output_shape_.resize(ws.GetRequestedBatchSize(0)); output_desc[0] = {output_shape_, output_type_}; - return false; + return false; // do not allocate outputs } void RunImpl(Workspace &ws) override; diff --git a/dali/operators/generic/constant_value.h b/dali/operators/generic/constant_value.h index e6b57eeee6..d6cdf1fce5 100644 --- a/dali/operators/generic/constant_value.h +++ b/dali/operators/generic/constant_value.h @@ -47,7 +47,6 @@ class ConstantValue : public StatelessOperator { return ws.GetRequestedBatchSize(0); } - bool CanInferOutputs() const override { return true; } bool CanBroadcastShapes(span shape1, span shape2) { size_t len1 = shape1.size(); diff --git a/dali/operators/generic/erase/erase.h b/dali/operators/generic/erase/erase.h index 1642527478..d28783b0b1 100644 --- a/dali/operators/generic/erase/erase.h +++ b/dali/operators/generic/erase/erase.h @@ -41,10 +41,6 @@ class Erase : public StatelessOperator { void RunImpl(Workspace &ws) override; bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; - bool CanInferOutputs() const override { - return true; - } - USE_OPERATOR_MEMBERS(); std::unique_ptr> impl_; diff --git a/dali/operators/generic/flip.h b/dali/operators/generic/flip.h index c8ea13948c..eb46ad0326 100644 --- a/dali/operators/generic/flip.h +++ b/dali/operators/generic/flip.h @@ -40,10 +40,6 @@ class Flip: public StatelessOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - void RunImpl(legacy_workspace_t &ws) override; int GetHorizontal(const ArgumentWorkspace &ws, int idx) { diff --git a/dali/operators/generic/join.h b/dali/operators/generic/join.h index 163d9c277f..e73560d623 100644 --- a/dali/operators/generic/join.h +++ b/dali/operators/generic/join.h @@ -48,7 +48,6 @@ class TensorJoin : public StatelessOperator { using Storage = detail::storage_tag_map_t; - bool CanInferOutputs() const override { return true; } void RunImpl(Workspace &ws) override; bool SetupImpl(vector &outputs, const Workspace &ws) override; diff --git a/dali/operators/generic/lookup_table.h b/dali/operators/generic/lookup_table.h index c4dab09657..c97c70d5d3 100644 --- a/dali/operators/generic/lookup_table.h +++ b/dali/operators/generic/lookup_table.h @@ -98,10 +98,6 @@ class LookupTable : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(LookupTable); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { if (std::is_same::value && !lut_.shape().num_elements()) { TYPE_SWITCH(output_type_, dali::type2id, OutputType, LUT_OUT_TYPES, ( diff --git 
a/dali/operators/generic/one_hot.h b/dali/operators/generic/one_hot.h index b6a7674d58..7931fb3819 100644 --- a/dali/operators/generic/one_hot.h +++ b/dali/operators/generic/one_hot.h @@ -76,10 +76,6 @@ class OneHot : public StatelessOperator { USE_OPERATOR_MEMBERS(); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); int input_sample_dim = input.shape().sample_dim(); diff --git a/dali/operators/generic/pad.h b/dali/operators/generic/pad.h index ce776bf038..b558d94791 100644 --- a/dali/operators/generic/pad.h +++ b/dali/operators/generic/pad.h @@ -51,10 +51,6 @@ class Pad : public StatelessOperator { using Operator::RunImpl; void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { - return true; - } - private: void ReadArguments(const OpSpec &spec, const Workspace &ws) { const auto &input = ws.Input(0); diff --git a/dali/operators/generic/permute_batch.h b/dali/operators/generic/permute_batch.h index d46ee138b6..07141005c2 100644 --- a/dali/operators/generic/permute_batch.h +++ b/dali/operators/generic/permute_batch.h @@ -64,10 +64,6 @@ class PermuteBatchBase : public StatelessOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - protected: vector indices_; diff --git a/dali/operators/generic/reduce/reduce.h b/dali/operators/generic/reduce/reduce.h index ad56b21391..3d56ed1d9a 100644 --- a/dali/operators/generic/reduce/reduce.h +++ b/dali/operators/generic/reduce/reduce.h @@ -44,7 +44,6 @@ class Reduce : public StatelessOperator, AxesHelper { spec.TryGetArgument(output_type_, "dtype"); } - bool CanInferOutputs() const override { return true; } inline ~Reduce() override = default; diff --git a/dali/operators/generic/reduce/reduce_with_mean_input.h b/dali/operators/generic/reduce/reduce_with_mean_input.h index ba04a9b804..50322a46dd 100644 --- a/dali/operators/generic/reduce/reduce_with_mean_input.h +++ b/dali/operators/generic/reduce/reduce_with_mean_input.h @@ -45,7 +45,6 @@ class ReduceWithMeanInput : public StatelessOperator, AxesHelper { ddof_(spec.GetArgument("ddof")) { } - bool CanInferOutputs() const override { return true; } inline ~ReduceWithMeanInput() override = default; diff --git a/dali/operators/generic/reshape.h b/dali/operators/generic/reshape.h index 6a9e696be9..601954c689 100644 --- a/dali/operators/generic/reshape.h +++ b/dali/operators/generic/reshape.h @@ -32,9 +32,8 @@ class Reshape : public StatelessOperator { explicit Reshape(const OpSpec &spec_); - bool CanInferOutputs() const override { - // Return false, because we specifically don't want the executor to allocate - // the storage for the output - even though we can infer the shape. 
+ bool HasContiguousOutputs() const override { + // The contiguity depends on the source operator's output return false; } diff --git a/dali/operators/generic/resize/tensor_resize.h b/dali/operators/generic/resize/tensor_resize.h index 8d277195c7..c6af1815d1 100644 --- a/dali/operators/generic/resize/tensor_resize.h +++ b/dali/operators/generic/resize/tensor_resize.h @@ -41,7 +41,6 @@ class TensorResize : public StatelessOperator int NumSpatialDims() const { return spatial_ndim_; } int FirstSpatialDim() const { return first_spatial_dim_; } - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; diff --git a/dali/operators/generic/roi_random_crop.cc b/dali/operators/generic/roi_random_crop.cc index 81dd7a69d2..68998fe4ef 100644 --- a/dali/operators/generic/roi_random_crop.cc +++ b/dali/operators/generic/roi_random_crop.cc @@ -71,7 +71,6 @@ bounds of the input. class ROIRandomCropCPU : public rng::OperatorWithRng { public: explicit ROIRandomCropCPU(const OpSpec &spec); - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/generic/shapes.h b/dali/operators/generic/shapes.h index 14bf9e424a..d490e9c726 100644 --- a/dali/operators/generic/shapes.h +++ b/dali/operators/generic/shapes.h @@ -47,7 +47,6 @@ class Shapes : public StatelessOperator { } } } - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); diff --git a/dali/operators/generic/slice/slice_base.h b/dali/operators/generic/slice/slice_base.h index b3872f07a2..60ab741da6 100644 --- a/dali/operators/generic/slice/slice_base.h +++ b/dali/operators/generic/slice/slice_base.h @@ -82,10 +82,6 @@ class SliceBase : public StatelessOperator { */ virtual void ProcessCroppingAttrs(const OpSpec &spec, const Workspace &ws) = 0; virtual const CropWindowGenerator &GetCropWindowGenerator(std::size_t data_idx) const = 0; - - bool CanInferOutputs() const override { - return true; - } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/generic/slice/subscript.h b/dali/operators/generic/slice/subscript.h index 62c68022bf..79f75f2ca5 100644 --- a/dali/operators/generic/slice/subscript.h +++ b/dali/operators/generic/slice/subscript.h @@ -330,7 +330,6 @@ class TensorSubscript : public StatelessOperator { return out_layout; } - bool CanInferOutputs() const override { return true; } using StatelessOperator::RunImpl; void RunImpl(Workspace &ws) override { diff --git a/dali/operators/generic/transpose/transpose.h b/dali/operators/generic/transpose/transpose.h index 37a448c784..cbe92928c9 100644 --- a/dali/operators/generic/transpose/transpose.h +++ b/dali/operators/generic/transpose/transpose.h @@ -104,10 +104,6 @@ class Transpose : public StatelessOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - protected: bool transpose_layout_; TensorLayout output_layout_arg_; diff --git a/dali/operators/generic/transpose/transpose_gpu.cc b/dali/operators/generic/transpose/transpose_gpu.cc index 65ce4ccc5c..c1192ca8bc 100644 --- a/dali/operators/generic/transpose/transpose_gpu.cc +++ b/dali/operators/generic/transpose/transpose_gpu.cc @@ -32,10 +32,6 @@ class TransposeGPU : public Transpose { kmgr_.Resize(1); } - bool CanInferOutputs() const 
override { - return true; - } - protected: bool SetupImpl(vector &descs, const Workspace &ws) override { Transpose::SetupImpl(descs, ws); diff --git a/dali/operators/geometry/affine_transforms/combine_transforms.cc b/dali/operators/geometry/affine_transforms/combine_transforms.cc index f7373ccd13..7bd6fe5228 100644 --- a/dali/operators/geometry/affine_transforms/combine_transforms.cc +++ b/dali/operators/geometry/affine_transforms/combine_transforms.cc @@ -58,7 +58,6 @@ class CombineTransformsCPU : public SequenceOperator("reverse_order")) { } - bool CanInferOutputs() const override { return true; } protected: bool SetupImpl(std::vector &output_descs, diff --git a/dali/operators/geometry/affine_transforms/transform_base_op.h b/dali/operators/geometry/affine_transforms/transform_base_op.h index 05c991eae3..da05aea4be 100644 --- a/dali/operators/geometry/affine_transforms/transform_base_op.h +++ b/dali/operators/geometry/affine_transforms/transform_base_op.h @@ -55,7 +55,6 @@ class TransformBaseOp : public SequenceOperator(*this); } const TransformImpl &This() const noexcept { return static_cast(*this); } diff --git a/dali/operators/geometry/coord_flip.h b/dali/operators/geometry/coord_flip.h index 051a2cee59..259700ec68 100644 --- a/dali/operators/geometry/coord_flip.h +++ b/dali/operators/geometry/coord_flip.h @@ -35,10 +35,6 @@ class CoordFlip : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(CoordFlip); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); DALI_ENFORCE(input.type() == DALI_FLOAT, "Input is expected to be float"); diff --git a/dali/operators/geometry/coord_transform.h b/dali/operators/geometry/coord_transform.h index 8a419f400c..6bd7385644 100644 --- a/dali/operators/geometry/coord_transform.h +++ b/dali/operators/geometry/coord_transform.h @@ -39,7 +39,6 @@ class CoordTransform : public SequenceOperator dtype_ = spec_.template GetArgument("dtype"); } - bool CanInferOutputs() const override { return true; } protected: using Base::spec_; diff --git a/dali/operators/image/color/brightness_contrast.h b/dali/operators/image/color/brightness_contrast.h index ceb5efe5bd..c358be0940 100644 --- a/dali/operators/image/color/brightness_contrast.h +++ b/dali/operators/image/color/brightness_contrast.h @@ -68,10 +68,6 @@ class BrightnessContrastOp : public SequenceOperator spec.TryGetArgument(output_type_arg_, "dtype"); } - bool CanInferOutputs() const override { - return true; - } - // The operator needs 4 dim path for DHWC data, so use it to avoid inflating // the number of samples and parameters unnecessarily for FHWC when there are no // per-frame parameters provided. 
diff --git a/dali/operators/image/color/color_space_conversion.h b/dali/operators/image/color/color_space_conversion.h index dfdd76a444..3e7c287d0c 100644 --- a/dali/operators/image/color/color_space_conversion.h +++ b/dali/operators/image/color/color_space_conversion.h @@ -35,8 +35,6 @@ class ColorSpaceConversion : public StatelessOperator { } protected: - bool CanInferOutputs() const override { return true; } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); diff --git a/dali/operators/image/color/color_twist.h b/dali/operators/image/color/color_twist.h index 8ddbf153e3..e689049ea5 100644 --- a/dali/operators/image/color/color_twist.h +++ b/dali/operators/image/color/color_twist.h @@ -100,10 +100,6 @@ class ColorTwistBase : public SequenceOperator { spec.TryGetArgument(output_type_arg_, color::kOutputType); } - bool CanInferOutputs() const override { - return true; - } - // The operator needs 4 dim path for DHWC data, so use it to avoid inflating // the number of samples and parameters unnecessarily for FHWC when there are no // per-frame parameters provided. diff --git a/dali/operators/image/color/debayer.h b/dali/operators/image/color/debayer.h index 365b1c9415..449ffc3a0e 100644 --- a/dali/operators/image/color/debayer.h +++ b/dali/operators/image/color/debayer.h @@ -127,10 +127,6 @@ class Debayer : public SequenceOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - void AcquirePatternArgument(const Workspace &ws, int batch_size) { if (!spec_.HasTensorArgument(debayer::kBluePosArgName)) { pattern_.resize(batch_size, static_pattern_); diff --git a/dali/operators/image/color/equalize.h b/dali/operators/image/color/equalize.h index b275b9869b..b9c59fda48 100644 --- a/dali/operators/image/color/equalize.h +++ b/dali/operators/image/color/equalize.h @@ -47,10 +47,6 @@ class Equalize : public SequenceOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - bool ShouldExpandChannels(int input_idx) const override { (void)input_idx; return true; diff --git a/dali/operators/image/convolution/filter.h b/dali/operators/image/convolution/filter.h index 0215ee6cf6..f2cc523a74 100644 --- a/dali/operators/image/convolution/filter.h +++ b/dali/operators/image/convolution/filter.h @@ -182,10 +182,6 @@ class Filter : public SequenceOperator { DISABLE_COPY_MOVE_ASSIGN(Filter); protected: - bool CanInferOutputs() const override { - return true; - } - bool ShouldExpand(const Workspace& ws) override { const auto& input_layout = ws.GetInputLayout(0); int frame_idx = VideoLayoutInfo::FrameDimIndex(input_layout); diff --git a/dali/operators/image/convolution/gaussian_blur.h b/dali/operators/image/convolution/gaussian_blur.h index f54aa15f89..d2285b5136 100644 --- a/dali/operators/image/convolution/gaussian_blur.h +++ b/dali/operators/image/convolution/gaussian_blur.h @@ -50,10 +50,6 @@ class GaussianBlur : public SequenceOperator { DISABLE_COPY_MOVE_ASSIGN(GaussianBlur); protected: - bool CanInferOutputs() const override { - return true; - } - bool ShouldExpandChannels(int input_idx) const override { (void)input_idx; return true; diff --git a/dali/operators/image/convolution/laplacian.h b/dali/operators/image/convolution/laplacian.h index 0230031564..a3ca82136d 100644 --- a/dali/operators/image/convolution/laplacian.h +++ b/dali/operators/image/convolution/laplacian.h @@ -51,10 +51,6 @@ class Laplacian : public SequenceOperator { DISABLE_COPY_MOVE_ASSIGN(Laplacian); protected: - bool 
CanInferOutputs() const override { - return true; - } - bool ShouldExpandChannels(int input_idx) const override { (void)input_idx; return true; diff --git a/dali/operators/image/crop/crop_mirror_normalize.h b/dali/operators/image/crop/crop_mirror_normalize.h index 992576b310..3b9d19f1af 100755 --- a/dali/operators/image/crop/crop_mirror_normalize.h +++ b/dali/operators/image/crop/crop_mirror_normalize.h @@ -118,10 +118,6 @@ class CropMirrorNormalize : public StatelessOperator { void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { - return true; - } - void ProcessNormArgs(int sample_idx) { span mean_arg(mean_arg_[sample_idx].data, mean_arg_[sample_idx].num_elements()); span std_arg(std_arg_[sample_idx].data, std_arg_[sample_idx].num_elements()); diff --git a/dali/operators/image/crop/new_crop_mirror_normalize.cu b/dali/operators/image/crop/new_crop_mirror_normalize.cu index df5c4cbb5d..78c46ea8c8 100644 --- a/dali/operators/image/crop/new_crop_mirror_normalize.cu +++ b/dali/operators/image/crop/new_crop_mirror_normalize.cu @@ -325,10 +325,6 @@ class NewCropMirrorNormalizeGPU : public StatelessOperator { RunSliceHwc2HwcChwNormalize(ws); } - bool CanInferOutputs() const override { - return true; - } - /** * @brief Compute the 2D ROI for given sample_idx, crop_attr_ must be update first. diff --git a/dali/operators/image/crop/random_crop_generator.h b/dali/operators/image/crop/random_crop_generator.h index ad17eeb8c0..2414eb3706 100644 --- a/dali/operators/image/crop/random_crop_generator.h +++ b/dali/operators/image/crop/random_crop_generator.h @@ -45,7 +45,6 @@ class RandomCropGeneratorOp : public Operator { USE_OPERATOR_MEMBERS(); using Operator::RunImpl; - bool CanInferOutputs() const override { return true; } protected: bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { diff --git a/dali/operators/image/distortion/jpeg_compression_distortion_op.h b/dali/operators/image/distortion/jpeg_compression_distortion_op.h index 149ae930de..19d00a8aba 100644 --- a/dali/operators/image/distortion/jpeg_compression_distortion_op.h +++ b/dali/operators/image/distortion/jpeg_compression_distortion_op.h @@ -34,10 +34,6 @@ class JpegCompressionDistortion : public StatelessOperator { quality_arg_("quality", spec) { } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); output_desc.resize(1); diff --git a/dali/operators/image/filter/median_blur.cc b/dali/operators/image/filter/median_blur.cc index 668e306102..2be2bb1b42 100644 --- a/dali/operators/image/filter/median_blur.cc +++ b/dali/operators/image/filter/median_blur.cc @@ -55,10 +55,6 @@ class MedianBlur : public nvcvop::NVCVSequenceOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); auto sh = input.shape(); diff --git a/dali/operators/image/mask/grid_mask.h b/dali/operators/image/mask/grid_mask.h index 174bdc1e0d..a672d323b3 100644 --- a/dali/operators/image/mask/grid_mask.h +++ b/dali/operators/image/mask/grid_mask.h @@ -29,7 +29,6 @@ class GridMask : public StatelessOperator { explicit GridMask(const OpSpec &spec) : StatelessOperator(spec) { } protected: - bool CanInferOutputs() const override { return true; } void GetArguments(const Workspace &ws) { int batch_size = ws.GetInputBatchSize(0); this->GetPerSampleArgument(tile_, "tile", ws, batch_size); 
diff --git a/dali/operators/image/morphology/morphology.h b/dali/operators/image/morphology/morphology.h index a16a6ad842..fe57696bd2 100644 --- a/dali/operators/image/morphology/morphology.h +++ b/dali/operators/image/morphology/morphology.h @@ -41,10 +41,6 @@ class Morphology : public nvcvop::NVCVSequenceOperator { DALI_ENFORCE(iteration_ >= 1, "iterations must be >= 1"); } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; bool ShouldExpandChannels(int input_idx) const override { diff --git a/dali/operators/image/paste/multipaste.h b/dali/operators/image/paste/multipaste.h index 9906c36eb4..73b7d54437 100644 --- a/dali/operators/image/paste/multipaste.h +++ b/dali/operators/image/paste/multipaste.h @@ -111,10 +111,6 @@ class MultiPasteOp : public SequenceOperator { } } - bool CanInferOutputs() const override { - return true; - } - bool Intersects(ivec2 anchors1, ivec2 shapes1, ivec2 anchors2, ivec2 shapes2) const { for (int i = 0; i < 2; i++) { diff --git a/dali/operators/image/remap/cvcuda/warp_perspective.cc b/dali/operators/image/remap/cvcuda/warp_perspective.cc index 1f646e3311..bc258b89e2 100644 --- a/dali/operators/image/remap/cvcuda/warp_perspective.cc +++ b/dali/operators/image/remap/cvcuda/warp_perspective.cc @@ -101,10 +101,6 @@ class WarpPerspective : public nvcvop::NVCVSequenceOperator { return true; } - bool CanInferOutputs() const override { - return true; - } - float4 GetFillValue(int channels) const { if (fill_value_arg_.size() > 1) { if (channels > 0) { diff --git a/dali/operators/image/remap/displacement_filter_impl_cpu.h b/dali/operators/image/remap/displacement_filter_impl_cpu.h index 0d9b8efce9..905b05ba0a 100644 --- a/dali/operators/image/remap/displacement_filter_impl_cpu.h +++ b/dali/operators/image/remap/displacement_filter_impl_cpu.h @@ -143,10 +143,6 @@ class DisplacementFilter } } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); output_desc.resize(1); diff --git a/dali/operators/image/remap/displacement_filter_impl_gpu.cuh b/dali/operators/image/remap/displacement_filter_impl_gpu.cuh index 9a7ee82259..6b130aa0c1 100644 --- a/dali/operators/image/remap/displacement_filter_impl_gpu.cuh +++ b/dali/operators/image/remap/displacement_filter_impl_gpu.cuh @@ -240,10 +240,6 @@ class DisplacementFilter displace_.Cleanup(); } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); output_desc.resize(1); diff --git a/dali/operators/image/remap/remap.h b/dali/operators/image/remap/remap.h index 7a83336c0c..1f9650c8c8 100644 --- a/dali/operators/image/remap/remap.h +++ b/dali/operators/image/remap/remap.h @@ -46,11 +46,6 @@ class Remap : public SequenceOperator { DISABLE_COPY_MOVE_ASSIGN(Remap); protected: - bool CanInferOutputs() const override { - return true; - } - - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.template Input(0); diff --git a/dali/operators/image/remap/warp.h b/dali/operators/image/remap/warp.h index ecbae78da3..b8d7c2b7ff 100644 --- a/dali/operators/image/remap/warp.h +++ b/dali/operators/image/remap/warp.h @@ -277,10 +277,6 @@ class Warp : public SequenceOperator { return border_clamp_; } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &outputs, 
const Workspace &ws) override { outputs.resize(1); diff --git a/dali/operators/image/resize/experimental/resize.h b/dali/operators/image/resize/experimental/resize.h index ca390fb342..866b7c983b 100644 --- a/dali/operators/image/resize/experimental/resize.h +++ b/dali/operators/image/resize/experimental/resize.h @@ -59,10 +59,6 @@ class CvCudaResize : public StatelessOperator, protected ResizeBase< return resize_attr_.first_spatial_dim_; } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/image/resize/random_resized_crop.h b/dali/operators/image/resize/random_resized_crop.h index 6fba14b200..f799d80202 100644 --- a/dali/operators/image/resize/random_resized_crop.h +++ b/dali/operators/image/resize/random_resized_crop.h @@ -50,7 +50,6 @@ class RandomResizedCrop : public Operator USE_OPERATOR_MEMBERS(); using Operator::RunImpl; - bool CanInferOutputs() const override { return true; } protected: bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { diff --git a/dali/operators/image/resize/resize.h b/dali/operators/image/resize/resize.h index 734cabd125..5617c3ee74 100644 --- a/dali/operators/image/resize/resize.h +++ b/dali/operators/image/resize/resize.h @@ -41,7 +41,6 @@ class Resize : public StatelessOperator int NumSpatialDims() const { return resize_attr_.spatial_ndim_; } int FirstSpatialDim() const { return resize_attr_.first_spatial_dim_; } - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; diff --git a/dali/operators/image/resize/resize_crop_mirror.h b/dali/operators/image/resize/resize_crop_mirror.h index a17a65e236..b5fa9a079a 100755 --- a/dali/operators/image/resize/resize_crop_mirror.h +++ b/dali/operators/image/resize/resize_crop_mirror.h @@ -56,7 +56,6 @@ class ResizeCropMirror : public StatelessOperator int NumSpatialDims() const { return resize_attr_.spatial_ndim_; } int FirstSpatialDim() const { return resize_attr_.first_spatial_dim_; } - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h index 6b330909b8..aed89353e4 100644 --- a/dali/operators/imgcodec/image_decoder.h +++ b/dali/operators/imgcodec/image_decoder.h @@ -468,10 +468,6 @@ class ImageDecoder : public StatelessOperator { decoder_params_["jpeg_fancy_upsampling"] = false; } - bool CanInferOutputs() const override { - return true; - } - void ParseSample(ParsedSample &parsed_sample, span encoded) { parsed_sample.encoded_stream = NvImageCodecCodeStream::FromHostMem(instance_, encoded.data(), encoded.size()); diff --git a/dali/operators/imgcodec/peek_image_shape.cc b/dali/operators/imgcodec/peek_image_shape.cc index 16bd5e4642..4f7c7b39c3 100644 --- a/dali/operators/imgcodec/peek_image_shape.cc +++ b/dali/operators/imgcodec/peek_image_shape.cc @@ -69,10 +69,6 @@ ImgcodecPeekImageShape::ImgcodecPeekImageShape(const OpSpec &spec) instance_ = NvImageCodecInstance::Create(&instance_create_info); } -bool ImgcodecPeekImageShape::CanInferOutputs() const { - return true; -} - bool ImgcodecPeekImageShape::SetupImpl(std::vector &output_desc, const Workspace &ws) { const auto &input = ws.template Input(0); diff --git a/dali/operators/imgcodec/peek_image_shape.h b/dali/operators/imgcodec/peek_image_shape.h index 
498975585e..54bd00ab21 100644 --- a/dali/operators/imgcodec/peek_image_shape.h +++ b/dali/operators/imgcodec/peek_image_shape.h @@ -33,8 +33,6 @@ class ImgcodecPeekImageShape : public StatelessOperator { explicit ImgcodecPeekImageShape(const OpSpec &spec); - bool CanInferOutputs() const override; - protected: bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; diff --git a/dali/operators/input/video_input.h b/dali/operators/input/video_input.h index 4bb865b9fe..9203956248 100644 --- a/dali/operators/input/video_input.h +++ b/dali/operators/input/video_input.h @@ -159,10 +159,6 @@ class VideoInput : public VideoDecoderBase, public Input } - bool CanInferOutputs() const override { - return true; - } - int NextBatchSize() override { return batch_size_; diff --git a/dali/operators/io/file/file_read.cc b/dali/operators/io/file/file_read.cc index 2b78664a31..4c7e14c6c3 100644 --- a/dali/operators/io/file/file_read.cc +++ b/dali/operators/io/file/file_read.cc @@ -26,7 +26,6 @@ class FileRead : public StatelessOperator { dont_use_mmap_(spec.GetArgument("dont_use_mmap")), use_o_direct_(spec.GetArgument("use_o_direct")) {} - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_descs, const Workspace &ws) override { const auto &filepaths = ws.Input(0); diff --git a/dali/operators/math/expressions/arithmetic.h b/dali/operators/math/expressions/arithmetic.h index a0192965e9..1f8f33a89a 100644 --- a/dali/operators/math/expressions/arithmetic.h +++ b/dali/operators/math/expressions/arithmetic.h @@ -334,10 +334,6 @@ class ArithmeticGenericOp : public StatelessOperator { } protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); for (int i = 1; i < ws.NumInput(); i++) { diff --git a/dali/operators/math/normalize/normalize.h b/dali/operators/math/normalize/normalize.h index 86530a1d84..9f96d98bc6 100644 --- a/dali/operators/math/normalize/normalize.h +++ b/dali/operators/math/normalize/normalize.h @@ -76,7 +76,6 @@ class NormalizeBase : public StatelessOperator { const Normalize &This() const noexcept { return static_cast&>(*this); } - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_descs, const Workspace &ws) override { const auto &input = ws.Input(0); diff --git a/dali/operators/numba_function/numba_func.h b/dali/operators/numba_function/numba_func.h index 1a1c28aff4..7ed34c76d8 100644 --- a/dali/operators/numba_function/numba_func.h +++ b/dali/operators/numba_function/numba_func.h @@ -69,8 +69,6 @@ class NumbaFuncImpl : public StatelessOperator { explicit NumbaFuncImpl(const OpSpec &spec_); protected: - bool CanInferOutputs() const override { return true; } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; /** diff --git a/dali/operators/python_function/dltensor_function.h b/dali/operators/python_function/dltensor_function.h index 71a3a9bd32..001797b556 100644 --- a/dali/operators/python_function/dltensor_function.h +++ b/dali/operators/python_function/dltensor_function.h @@ -168,6 +168,10 @@ class DLTensorPythonFunctionImpl : public StatelessOperator { } protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/python_function/jax_function.h b/dali/operators/python_function/jax_function.h index 1ae33239d5..bc55fcf80a 100644 --- 
a/dali/operators/python_function/jax_function.h +++ b/dali/operators/python_function/jax_function.h @@ -149,6 +149,10 @@ class JaxFunction : public StatelessOperator { python_function_.release(); } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/random/batch_permutation.h b/dali/operators/random/batch_permutation.h index 6883a19d9f..5ab742636f 100644 --- a/dali/operators/random/batch_permutation.h +++ b/dali/operators/random/batch_permutation.h @@ -34,7 +34,6 @@ class BatchPermutation : public rng::OperatorWithRng { return true; } void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { return true; } private: void NoRepetitions(int N); void WithRepetitions(int N); diff --git a/dali/operators/random/rng_base.h b/dali/operators/random/rng_base.h index 1fa1051dc5..2c0ff4888c 100644 --- a/dali/operators/random/rng_base.h +++ b/dali/operators/random/rng_base.h @@ -154,10 +154,6 @@ class RNGBase : public OperatorWithRng { /** @} */ // end of RngCRTP - bool CanInferOutputs() const override { - return true; - } - int GetBatchSize(const Workspace &ws) const { if (spec_.NumRegularInput() == 1) return ws.Input(0).shape().size(); diff --git a/dali/operators/random/uniform.h b/dali/operators/random/uniform.h index 3a03fa8b88..993e8916c0 100644 --- a/dali/operators/random/uniform.h +++ b/dali/operators/random/uniform.h @@ -51,10 +51,6 @@ class Uniform : public Operator { using Operator::RunImpl; protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); auto curr_batch_size = ws.GetRequestedBatchSize(0); diff --git a/dali/operators/reader/file_reader_op.h b/dali/operators/reader/file_reader_op.h index 893e3c3c44..4f13d04def 100644 --- a/dali/operators/reader/file_reader_op.h +++ b/dali/operators/reader/file_reader_op.h @@ -35,7 +35,7 @@ class FileReader : public DataReaderSetInitialSnapshot(); } - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return true; } diff --git a/dali/operators/reader/fits_reader_op.h b/dali/operators/reader/fits_reader_op.h index b41b2c3e60..891b8f5f9f 100644 --- a/dali/operators/reader/fits_reader_op.h +++ b/dali/operators/reader/fits_reader_op.h @@ -31,7 +31,7 @@ class FitsReader : public DataReader { public: explicit FitsReader(const OpSpec& spec) : DataReader(spec) {} - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return true; } diff --git a/dali/operators/reader/numpy_reader_op.h b/dali/operators/reader/numpy_reader_op.h index 318c5c528e..f712c50a2c 100644 --- a/dali/operators/reader/numpy_reader_op.h +++ b/dali/operators/reader/numpy_reader_op.h @@ -44,7 +44,7 @@ class NumpyReader : public DataReader { } } - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return true; } diff --git a/dali/operators/reader/reader_op.h b/dali/operators/reader/reader_op.h index f7dd4aae4f..247d26d216 100644 --- a/dali/operators/reader/reader_op.h +++ b/dali/operators/reader/reader_op.h @@ -174,6 +174,10 @@ class DataReader : public Operator { } } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { // If necessary start prefetching thread and wait for a consumable batch StartPrefetchThread(); diff --git 
a/dali/operators/reader/tfrecord_reader_op.h b/dali/operators/reader/tfrecord_reader_op.h index 450ed5e892..4c2cbd453b 100644 --- a/dali/operators/reader/tfrecord_reader_op.h +++ b/dali/operators/reader/tfrecord_reader_op.h @@ -63,7 +63,7 @@ class TFRecordReader thread_pool.RunAll(); // Propagate metadata from individual samples to the whole batch as working with SampleWorkspace // breaks metadata consistency - it sets it only to samples - FixBatchPropertiesConsistency(ws, CanInferOutputs()); + FixBatchPropertiesConsistency(ws, HasContiguousOutputs()); } ~TFRecordReader() override { diff --git a/dali/operators/reader/video_reader_decoder_gpu_op.h b/dali/operators/reader/video_reader_decoder_gpu_op.h index 64c24163e0..ca7fbd3fc0 100644 --- a/dali/operators/reader/video_reader_decoder_gpu_op.h +++ b/dali/operators/reader/video_reader_decoder_gpu_op.h @@ -29,7 +29,7 @@ class VideoReaderDecoderGpu : public DataReadersequence.ShareData(sample_shared_ptr, curr_tensor_list.capacity(), curr_tensor_list.is_pinned(), curr_tensor_list.shape()[data_idx], curr_tensor_list.type(), curr_tensor_list.device_id(), diff --git a/dali/operators/reader/webdataset_reader_op.h b/dali/operators/reader/webdataset_reader_op.h index 9c5928657c..21a117de1f 100644 --- a/dali/operators/reader/webdataset_reader_op.h +++ b/dali/operators/reader/webdataset_reader_op.h @@ -32,7 +32,7 @@ class DLL_PUBLIC WebdatasetReader bool SetupImpl(std::vector& output_desc, const Workspace&) override; void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return true; } diff --git a/dali/operators/segmentation/random_mask_pixel.cc b/dali/operators/segmentation/random_mask_pixel.cc index a335b701cf..0b72279646 100644 --- a/dali/operators/segmentation/random_mask_pixel.cc +++ b/dali/operators/segmentation/random_mask_pixel.cc @@ -56,7 +56,6 @@ If 0, the pixel position is sampled uniformly from all available pixels.)code", class RandomMaskPixelCPU : public rng::OperatorWithRng { public: explicit RandomMaskPixelCPU(const OpSpec &spec); - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/segmentation/random_object_bbox.h b/dali/operators/segmentation/random_object_bbox.h index 0994f46b93..0fbb3ee832 100644 --- a/dali/operators/segmentation/random_object_bbox.h +++ b/dali/operators/segmentation/random_object_bbox.h @@ -87,10 +87,6 @@ class RandomObjectBBox : public rng::OperatorWithRng { "Possible values: \"anchor_shape\", \"start_end\" and \"box\".")); } - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(vector &out_descs, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/segmentation/select_masks.h b/dali/operators/segmentation/select_masks.h index 31d3fb35f7..c05b7119b4 100644 --- a/dali/operators/segmentation/select_masks.h +++ b/dali/operators/segmentation/select_masks.h @@ -36,10 +36,6 @@ class SelectMasksCPU : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(SelectMasksCPU); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/sequence/element_extract.h b/dali/operators/sequence/element_extract.h index 2f5bed6290..524240514c 100644 --- a/dali/operators/sequence/element_extract.h +++ 
b/dali/operators/sequence/element_extract.h @@ -83,10 +83,6 @@ class ElementExtract : public StatelessOperator { } protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { const auto &input = ws.Input(0); output_desc.resize(element_map_.size()); diff --git a/dali/operators/sequence/optical_flow/optical_flow.h b/dali/operators/sequence/optical_flow/optical_flow.h index 636a858a2b..21fe3374a7 100644 --- a/dali/operators/sequence/optical_flow/optical_flow.h +++ b/dali/operators/sequence/optical_flow/optical_flow.h @@ -133,10 +133,6 @@ class OpticalFlow : public StatelessOperator { void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { - return true; - } - private: /** * Optical flow lazy initialization diff --git a/dali/operators/sequence/per_frame.h b/dali/operators/sequence/per_frame.h index c7d656b25e..85bc34ce89 100644 --- a/dali/operators/sequence/per_frame.h +++ b/dali/operators/sequence/per_frame.h @@ -33,9 +33,8 @@ class PerFrame : public StatelessOperator { : StatelessOperator(spec), replace_(spec.GetArgument("replace")) {} protected: - bool CanInferOutputs() const override { - // Return false to prevent executor from allocating memory for the output, - // even though the output shape could be inferred, as it is same as input + bool HasContiguousOutputs() const override { + // This operator creates more samples by replicating existing ones. return false; } diff --git a/dali/operators/sequence/sequence_rearrange.h b/dali/operators/sequence/sequence_rearrange.h index c7ccfb679c..e05f3e8a15 100644 --- a/dali/operators/sequence/sequence_rearrange.h +++ b/dali/operators/sequence/sequence_rearrange.h @@ -62,10 +62,6 @@ class SequenceRearrange : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(SequenceRearrange); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector& output_desc, const Workspace &ws) override { const auto& input = ws.Input(0); const auto& in_shape = input.shape(); // temporary in some cases diff --git a/dali/operators/signal/decibel/to_decibels_op.h b/dali/operators/signal/decibel/to_decibels_op.h index 8464bdcdc2..a9efa2f3fa 100644 --- a/dali/operators/signal/decibel/to_decibels_op.h +++ b/dali/operators/signal/decibel/to_decibels_op.h @@ -50,7 +50,6 @@ class ToDecibels : public StatelessOperator { } protected: - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/signal/fft/power_spectrum.h b/dali/operators/signal/fft/power_spectrum.h index 59f55f511f..2e5d8953f8 100644 --- a/dali/operators/signal/fft/power_spectrum.h +++ b/dali/operators/signal/fft/power_spectrum.h @@ -48,7 +48,6 @@ class PowerSpectrum : public StatelessOperator { } protected: - bool CanInferOutputs() const override { return true; } bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; diff --git a/dali/operators/signal/fft/spectrogram.h b/dali/operators/signal/fft/spectrogram.h index eb23f53d97..68169eecee 100644 --- a/dali/operators/signal/fft/spectrogram.h +++ b/dali/operators/signal/fft/spectrogram.h @@ -32,8 +32,6 @@ class DLL_PUBLIC Spectrogram : public StatelessOperator { DLL_PUBLIC ~Spectrogram() override = default; protected: - bool CanInferOutputs() const override { return true; } - bool SetupImpl(std::vector &output_desc, const Workspace 
&ws) override { assert(impl_ != nullptr); return impl_->SetupImpl(output_desc, ws); diff --git a/dali/operators/ssd/box_encoder.h b/dali/operators/ssd/box_encoder.h index ce01d60965..a245d34252 100644 --- a/dali/operators/ssd/box_encoder.h +++ b/dali/operators/ssd/box_encoder.h @@ -70,6 +70,10 @@ class BoxEncoder: public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(BoxEncoder); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/ssd/random_crop.h b/dali/operators/ssd/random_crop.h index 60bd6eb0b7..626b3f6d58 100644 --- a/dali/operators/ssd/random_crop.h +++ b/dali/operators/ssd/random_crop.h @@ -55,6 +55,10 @@ class SSDRandomCrop : public rng::OperatorWithRng { using Operator::RunImpl; protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/operators/util/get_property.h b/dali/operators/util/get_property.h index 59c0a03ef1..e7a54c0d06 100644 --- a/dali/operators/util/get_property.h +++ b/dali/operators/util/get_property.h @@ -35,7 +35,7 @@ class GetProperty : public StatelessOperator { property_reader_(GetPropertyReader(property_key_)) {} protected: - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return false; // we may broadcast a common value to all samples } diff --git a/dali/pipeline/data/copy_to_external.h b/dali/pipeline/data/copy_to_external.h index 4ec1ed40da..1de0a68d57 100644 --- a/dali/pipeline/data/copy_to_external.h +++ b/dali/pipeline/data/copy_to_external.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
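A note on the pattern running through these hunks: overrides of CanInferOutputs() that merely returned true are deleted outright, because the new HasContiguousOutputs() defaults to true, while operators that assemble their outputs sample by sample must now opt out explicitly. A minimal sketch of the two resulting shapes (the Example* operators are hypothetical, illustrative only, not part of the patch):

#include "dali/pipeline/operator/operator.h"

namespace dali {

// The common case: outputs are allocated by the executor from the OutputDesc
// returned by SetupImpl, hence they live in one contiguous buffer and the
// default HasContiguousOutputs() == true needs no override.
class ExampleInferringOp : public Operator<CPUBackend> {
 public:
  explicit ExampleInferringOp(const OpSpec &spec) : Operator<CPUBackend>(spec) {}

 protected:
  bool SetupImpl(std::vector<OutputDesc> &output_desc, const Workspace &ws) override {
    output_desc.resize(1);
    output_desc[0] = {ws.Input<CPUBackend>(0).shape(), ws.GetInputDataType(0)};
    return true;  // ask the executor to allocate the (contiguous) output
  }
  void RunImpl(Workspace &ws) override { /* fill the preallocated output */ }
};

// The exception: outputs are allocated or shared per sample, so the
// contiguity guarantee must be revoked explicitly.
class ExampleSamplewiseOp : public Operator<CPUBackend> {
 public:
  explicit ExampleSamplewiseOp(const OpSpec &spec) : Operator<CPUBackend>(spec) {}

 protected:
  bool HasContiguousOutputs() const override { return false; }
  bool SetupImpl(std::vector<OutputDesc> &, const Workspace &) override {
    return false;  // outputs are allocated in RunImpl, sample by sample
  }
  void RunImpl(Workspace &ws) override { /* per-sample allocation/sharing */ }
};

}  // namespace dali

Flipping the default to true keeps the common case silent and forces only the exceptional operators to state their behavior, which is exactly what the bulk of this patch does.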
@@ -69,7 +69,8 @@ inline void CopyToExternalImpl(void* dst, } if (src.IsContiguous()) { - type_info.template Copy(dst, unsafe_raw_data(src), src._num_elements(), + type_info.template Copy(dst, contiguous_raw_data(src), + src._num_elements(), order.stream(), use_copy_kernel); } else { const auto &src_shape = src.shape(); @@ -114,7 +115,7 @@ inline void CopyToExternalImpl(void** dsts, int samples_to_copy = sizes.size(); if (src.IsContiguous() && samples_to_copy == num_samples) { - type_info.template Copy(dsts, unsafe_raw_data(src), sizes.data(), + type_info.template Copy(dsts, contiguous_raw_data(src), sizes.data(), num_samples, order.stream(), use_copy_kernel); } else { diff --git a/dali/pipeline/data/tensor_list.cc b/dali/pipeline/data/tensor_list.cc index 75646ef5af..75bd24fd55 100644 --- a/dali/pipeline/data/tensor_list.cc +++ b/dali/pipeline/data/tensor_list.cc @@ -148,14 +148,15 @@ template typename void CopyImpl(DstBatch &dst, const SrcBatch &src, const TypeInfo &type_info, AccessOrder copy_order, bool use_copy_kernel = false) { if (dst.IsContiguous() && src.IsContiguous()) { - type_info.Copy(unsafe_raw_mutable_data(dst), unsafe_raw_data(src), + type_info.Copy(contiguous_raw_mutable_data(dst), + contiguous_raw_data(src), dst.shape().num_elements(), copy_order.stream(), use_copy_kernel); } else if (dst.IsContiguous() && !src.IsContiguous()) { - copy_impl::CopySamplewiseImpl(unsafe_raw_mutable_data(dst), src, + copy_impl::CopySamplewiseImpl(contiguous_raw_mutable_data(dst), src, type_info, copy_order, use_copy_kernel); } else if (!dst.IsContiguous() && src.IsContiguous()) { - copy_impl::CopySamplewiseImpl(dst, unsafe_raw_data(src), type_info, + copy_impl::CopySamplewiseImpl(dst, contiguous_raw_data(src), type_info, copy_order, use_copy_kernel); } else { copy_impl::CopySamplewiseImpl(dst, src, type_info, copy_order, @@ -1047,7 +1048,27 @@ void TensorList::resize_tensors(int new_size) { template void TensorList::UpdatePropertiesFromSamples(bool contiguous) { + if (contiguous) { + bool is_really_contiguous = true; + + const uint8_t *base_ptr = static_cast(contiguous_buffer_.raw_data()); + size_t size = type_info().size(); + + for (int i = 0; i < num_samples(); ++i) { + if (tensors_[i].raw_data() == nullptr) + DALI_ENFORCE(shape_[i].num_elements() == 0, + "Internal error: a non-empty sample has a null data pointer."); + if (base_ptr != tensors_[i].raw_data()) { + is_really_contiguous = false; + break; + } + base_ptr += shape_[i].num_elements() * size; + } + DALI_ENFORCE(is_really_contiguous, + "Internal error: The tensor list isn't really contiguous as claimed."); + } state_.Update(contiguous ? BatchContiguity::Contiguous : BatchContiguity::Noncontiguous); + // assume that the curr_num_tensors_ is valid DALI_ENFORCE(curr_num_tensors_ > 0, "Unexpected empty output of per-sample operator. Internal DALI error."); @@ -1128,7 +1149,6 @@ bool TensorList::shares_data() const { return false; } - template class DLL_PUBLIC TensorList; template class DLL_PUBLIC TensorList; template void TensorList::Copy(const TensorList &, AccessOrder, diff --git a/dali/pipeline/data/tensor_list.h b/dali/pipeline/data/tensor_list.h index ad7d2e1b8d..c083f9c82e 100644 --- a/dali/pipeline/data/tensor_list.h +++ b/dali/pipeline/data/tensor_list.h @@ -816,33 +816,39 @@ class DLL_PUBLIC TensorList { * @brief Return an un-typed pointer to the underlying storage. * The TensorList must be either empty or have a valid type and be contiguous. 
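The verification loop added to UpdatePropertiesFromSamples above reduces to a pointer walk; a distilled standalone version follows (a sketch over plain vectors, not the actual TensorList internals - the real code additionally enforces that a null sample pointer only occurs for an empty sample):

#include <cstddef>
#include <cstdint>
#include <vector>

// A batch based at `base` is truly contiguous iff every sample starts exactly
// where the previous one ends.
bool IsReallyContiguous(const void *base, size_t element_size,
                        const std::vector<const void *> &sample_ptrs,
                        const std::vector<int64_t> &sample_elements) {
  const uint8_t *expected = static_cast<const uint8_t *>(base);
  for (size_t i = 0; i < sample_ptrs.size(); i++) {
    if (sample_ptrs[i] != expected)
      return false;  // a gap or a reordered sample - not one joint allocation
    expected += sample_elements[i] * element_size;
  }
  return true;
}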
*/ - friend void *unsafe_raw_mutable_data(TensorList &batch) { - DALI_ENFORCE(batch.IsContiguous(), "Data pointer can be obtain only for contiguous batch."); - return batch.contiguous_buffer_.raw_mutable_data(); + friend void *contiguous_raw_mutable_data(TensorList &batch) { + DALI_ENFORCE(batch.IsContiguousInMemory(), + "Data pointer can be obtained only for contiguous batch."); + if (batch.IsContiguous()) { + return batch.contiguous_buffer_.raw_mutable_data(); + } else { + for (int i = 0; i < batch.num_samples(); i++) { + if (void *sample_data = batch.tensors_[i].raw_mutable_data()) { + return sample_data; + } else { + assert(batch.shape_[i].num_elements() == 0); + } + } + return nullptr; // there are 0 elements + } } /** * @brief Return an un-typed const pointer to the underlying storage. * The TensorList must be either empty or have a valid type and be contiguous. */ - friend const void *unsafe_raw_data(const TensorList &batch) { - DALI_ENFORCE(batch.IsContiguous(), "Data pointer can be obtain only for contiguous batch."); - return batch.contiguous_buffer_.raw_data(); + friend const void *contiguous_raw_data(const TensorList &batch) { + return contiguous_raw_mutable_data(const_cast &>(batch)); } + /** * @brief Return the shared pointer, that we can use to correctly share the ownership of sample * with. * Sample 0 is aliased with the whole buffer, if it is contiguous. */ - friend shared_ptr unsafe_sample_owner(TensorList &batch, int sample_idx) { - // create new aliasing pointer to current data allocation, so we share the use count - // and the deleter correctly. - if (batch.IsContiguous()) { - return {batch.contiguous_buffer_.get_data_ptr(), batch.raw_mutable_tensor(sample_idx)}; - } else { - return batch.tensors_[sample_idx].get_data_ptr(); - } + friend const shared_ptr &unsafe_sample_owner(TensorList &batch, int sample_idx) { + return batch.tensors_[sample_idx].get_data_ptr(); } /** diff --git a/dali/pipeline/data/tensor_list_test.cc b/dali/pipeline/data/tensor_list_test.cc index b1bda90c19..3e2d001c7f 100644 --- a/dali/pipeline/data/tensor_list_test.cc +++ b/dali/pipeline/data/tensor_list_test.cc @@ -227,7 +227,7 @@ TYPED_TEST(TensorListTest, TestReserveResize) { ASSERT_EQ(tl.capacity(), shape.num_elements() * sizeof(float)); ASSERT_EQ(tl.nbytes(), 0); ASSERT_EQ(tl._num_elements(), 0); - ASSERT_NE(unsafe_raw_data(tl), nullptr); + ASSERT_NE(contiguous_raw_data(tl), nullptr); // Give the tensor a type tl.template set_type(); @@ -301,7 +301,7 @@ TYPED_TEST(TensorListTest, TestGetContiguousPointer) { ASSERT_EQ(tl.nbytes(), volume * sizeof(uint32_t)); ASSERT_EQ(tl.type(), DALI_UINT32); ASSERT_TRUE(tl.IsContiguous()); - ASSERT_NE(unsafe_raw_data(tl), nullptr); + ASSERT_NE(contiguous_raw_data(tl), nullptr); } TYPED_TEST(TensorListTest, TestGetBytesThenAccess) { @@ -568,7 +568,7 @@ TYPED_TEST(TensorListTest, TestTypeChange) { DALIDataType initial_type = DALI_FLOAT; std::array types = {DALI_FLOAT, DALI_INT32, DALI_UINT8, DALI_FLOAT64}; const auto *base_ptr = - this->kContiguity == BatchContiguity::Contiguous ? unsafe_raw_data(tensor_list) : nullptr; + this->kContiguity == BatchContiguity::Contiguous ? 
contiguous_raw_data(tensor_list) : nullptr; size_t nbytes = shape.num_elements() * sizeof(float); // Save the pointers @@ -597,7 +597,7 @@ TYPED_TEST(TensorListTest, TestTypeChange) { // The side-effects of only reallocating when we need a bigger buffer, we may use padding if (TypeTable::GetTypeInfo(new_type).size() <= TypeTable::GetTypeInfo(initial_type).size()) { if (this->kContiguity == BatchContiguity::Contiguous) { - ASSERT_EQ(unsafe_raw_data(tensor_list), base_ptr); + ASSERT_EQ(contiguous_raw_data(tensor_list), base_ptr); } else { for (int i = 0; i < tensor_list.num_samples(); ++i) { ASSERT_EQ(tensor_list.raw_tensor(i), ptrs[i]); @@ -1199,7 +1199,7 @@ TYPED_TEST(TensorListSuite, ResizeSetSize) { tv.Resize(new_shape); tv.SetLayout("HWC"); - const auto *base = static_cast(unsafe_raw_data(tv)); + const auto *base = static_cast(contiguous_raw_data(tv)); for (int i = 0; i < 3; i++) { EXPECT_EQ(tv[i].raw_data(), base); @@ -1222,7 +1222,7 @@ TYPED_TEST(TensorListSuite, ResizeSetSize) { tv.SetSize(3); EXPECT_TRUE(tv.IsContiguous()); - base = static_cast(unsafe_raw_data(tv)); + base = static_cast(contiguous_raw_data(tv)); for (int i = 0; i < 2; i++) { EXPECT_EQ(tv[i].raw_data(), base); @@ -1279,7 +1279,7 @@ TYPED_TEST(TensorListSuite, ContiguousResize) { tv.CopySample(i, tv, i); } - const auto *base = static_cast(unsafe_raw_data(tv)); + const auto *base = static_cast(contiguous_raw_data(tv)); EXPECT_TRUE(tv.IsContiguous()); for (int i = 0; i < 3; i++) { @@ -1649,11 +1649,11 @@ TYPED_TEST(TensorListSuite, EmptyTensorListAsTensorAccess) { auto tensor_2d = tv.AsReshapedTensor(shape_2d); EXPECT_EQ(tensor_1d.shape(), shape_1d); EXPECT_EQ(tensor_1d.type(), DALI_INT32); - EXPECT_EQ(tensor_1d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_1d.raw_data(), contiguous_raw_data(tv)); EXPECT_EQ(tensor_1d.raw_data(), nullptr); EXPECT_EQ(tensor_2d.shape(), shape_2d); EXPECT_EQ(tensor_2d.type(), DALI_INT32); - EXPECT_EQ(tensor_2d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_2d.raw_data(), contiguous_raw_data(tv)); EXPECT_EQ(tensor_2d.raw_data(), nullptr); } @@ -1665,11 +1665,11 @@ TYPED_TEST(TensorListSuite, EmptyTensorListAsTensorAccess) { auto tensor_2d = tv.AsReshapedTensor(shape_2d); EXPECT_EQ(tensor_1d.shape(), shape_1d); EXPECT_EQ(tensor_1d.type(), DALI_INT32); - EXPECT_EQ(tensor_1d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_1d.raw_data(), contiguous_raw_data(tv)); EXPECT_NE(tensor_1d.raw_data(), nullptr); EXPECT_EQ(tensor_2d.shape(), shape_2d); EXPECT_EQ(tensor_2d.type(), DALI_INT32); - EXPECT_EQ(tensor_2d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_2d.raw_data(), contiguous_raw_data(tv)); EXPECT_NE(tensor_2d.raw_data(), nullptr); } } @@ -1694,14 +1694,14 @@ TYPED_TEST(TensorListSuite, EmptyTensorListWithDimAsTensorAccess) { auto tensor_1d = tv.AsTensor(); EXPECT_EQ(tensor_1d.shape(), shape_1d); EXPECT_EQ(tensor_1d.type(), DALI_INT32); - EXPECT_EQ(tensor_1d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_1d.raw_data(), contiguous_raw_data(tv)); EXPECT_EQ(tensor_1d.raw_data(), nullptr); tv.set_sample_dim(2); auto tensor_2d = tv.AsTensor(); EXPECT_EQ(tensor_2d.shape(), shape_2d); EXPECT_EQ(tensor_2d.type(), DALI_INT32); - EXPECT_EQ(tensor_2d.raw_data(), unsafe_raw_data(tv)); + EXPECT_EQ(tensor_2d.raw_data(), contiguous_raw_data(tv)); EXPECT_EQ(tensor_2d.raw_data(), nullptr); } } diff --git a/dali/pipeline/executor/executor2/exec2_ops_for_test.h b/dali/pipeline/executor/executor2/exec2_ops_for_test.h index 17e2c4b772..cfcb206f6c 100644 --- 
a/dali/pipeline/executor/executor2/exec2_ops_for_test.h +++ b/dali/pipeline/executor/executor2/exec2_ops_for_test.h @@ -72,7 +72,6 @@ class DummyOpCPU : public Operator { *ws.Output(0)[s].mutable_data() = sample_sums_[s]; } - bool CanInferOutputs() const override { return true; } ArgValue addend_{"addend", spec_}; double delay_ms_ = 0; @@ -100,7 +99,6 @@ class DummyOpGPU : public Operator { void RunImpl(Workspace &ws) override; - bool CanInferOutputs() const override { return true; } private: ArgValue addend_{"addend", spec_}; @@ -136,7 +134,6 @@ class CounterOp : public Operator { } } - bool CanInferOutputs() const override { return true; } int counter = 0; }; @@ -172,7 +169,6 @@ class SinkOp : public Operator { } } - bool CanInferOutputs() const override { return true; } int64_t acc = 0; }; diff --git a/dali/pipeline/executor/executor2/exec_graph_analysis.cc b/dali/pipeline/executor/executor2/exec_graph_analysis.cc index 173ca29c57..eac66167a4 100644 --- a/dali/pipeline/executor/executor2/exec_graph_analysis.cc +++ b/dali/pipeline/executor/executor2/exec_graph_analysis.cc @@ -20,6 +20,7 @@ #include #include "dali/pipeline/executor/executor2/exec_graph.h" #include "dali/pipeline/graph/op_graph2.h" +#include "dali/pipeline/operator/builtin/make_contiguous.h" namespace dali { namespace exec2 { @@ -47,6 +48,13 @@ class ExecGraph::Analyzer { } } + void SetMakeContiguousMode(ExecGraph &g) { + for (auto &node : g.Nodes()) { + if (node.op) + dali::SetMakeContiguousMode(*node.op, MakeContiguousMode::Opportunistic); + } + } + bool HasParallelConsumers(const ExecOutputDesc &out) { int ncons = out.consumers.size(); // If there's just one outgoing edge from that input, we're safe. @@ -191,6 +199,7 @@ void ExecGraph::Analyze() { if (analyzed_) return; Analyzer a; + a.SetMakeContiguousMode(*this); a.MarkPinnedBuffers(*this); a.MarkOutputsWithParallelConsumers(*this); analyzed_ = true; diff --git a/dali/pipeline/executor/executor2/stream_assignment_test.cc b/dali/pipeline/executor/executor2/stream_assignment_test.cc index a202162648..42feda8928 100644 --- a/dali/pipeline/executor/executor2/stream_assignment_test.cc +++ b/dali/pipeline/executor/executor2/stream_assignment_test.cc @@ -31,6 +31,11 @@ class StreamAssignmentDummyOp : public Operator { USE_OPERATOR_MEMBERS(); void RunImpl(Workspace &ws) override {} + + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/pipeline/executor/executor_impl.cc b/dali/pipeline/executor/executor_impl.cc index 19ef9c7e3a..f84f1d0e55 100644 --- a/dali/pipeline/executor/executor_impl.cc +++ b/dali/pipeline/executor/executor_impl.cc @@ -392,6 +392,14 @@ void Executor::RunGPU() { } } +inline void AssertOutputsContiguous(const Workspace &ws) { + for (int i = 0; i < ws.NumOutput(); i++) { + if (ws.OutputIsType(i)) + assert(ws.Output(i).IsContiguousInMemory()); + else if (ws.OutputIsType(i)) + assert(ws.Output(i).IsContiguousInMemory()); + } +} template void Executor::RunHelper(OpNode &op_node, Workspace &ws, @@ -495,10 +503,6 @@ void Executor::RunHelper(OpNode &op_node, Workspac DALI_ENFORCE( static_cast(ws.NumOutput()) == output_desc.size(), "Operator::Setup returned shape and type information for mismatched number of outputs"); - DALI_ENFORCE(op.CanInferOutputs(), - "Operator::Setup returned true indicating that it successfully calculated " - "shape and type information for Operator outputs. 
In that case " - "CanInferOutputs should always return true."); for (int i = 0; i < ws.NumOutput(); i++) { auto &desc = output_desc[i]; if (ws.OutputIsType(i)) { @@ -507,11 +511,6 @@ void Executor::RunHelper(OpNode &op_node, Workspac ws.Output(i).Resize(desc.shape, desc.type); } } - } else { - DALI_ENFORCE(!op.CanInferOutputs(), - "Operator::Setup returned false indicating that it cannot calculate shape and " - "type information for Operator outputs. In that case CanInferOutputs should " - "always return false."); } } @@ -522,6 +521,9 @@ void Executor::RunHelper(OpNode &op_node, Workspac op.Run(ws); } + if (op.HasContiguousOutputs()) + AssertOutputsContiguous(ws); + PropagateSourceInfo(ws); /* TODO(michalz): Find a way to make this valid in presence of passthrough between stages diff --git a/dali/pipeline/executor/executor_impl.h b/dali/pipeline/executor/executor_impl.h index 21cbda936e..cba013b150 100644 --- a/dali/pipeline/executor/executor_impl.h +++ b/dali/pipeline/executor/executor_impl.h @@ -716,7 +716,7 @@ void Executor::PresizeData( }; auto reserve_batch = [](auto &storage, Index hint, int batch_size) { - // If the batch was marked as contiguous (for example due to op.CanInferOutputs being true) + // If the batch was marked as contiguous (for example due to op.HasContiguousOutputs being true) // reserve a contiguous batch. if (storage->IsContiguous()) { storage->reserve(hint * batch_size); @@ -748,7 +748,7 @@ void Executor::PresizeData( if (op_type_static == OpType::MIXED) { storage->SetContiguity(BatchContiguity::Contiguous); } - if (node.op->CanInferOutputs()) { + if (node.op->HasContiguousOutputs()) { storage->SetContiguity(BatchContiguity::Contiguous); } if (should_reserve(storage, hint, dev_static)) { diff --git a/dali/pipeline/executor/lowered_graph.cc b/dali/pipeline/executor/lowered_graph.cc index 627ac40d7d..3b130fc2b2 100644 --- a/dali/pipeline/executor/lowered_graph.cc +++ b/dali/pipeline/executor/lowered_graph.cc @@ -481,7 +481,7 @@ bool OpGraph::IsAlwaysContiguous(TensorNodeId tensor_id) const { // If the input is inferred, the allocation is done by executor in contiguous fashion // this means we can just pass through the data instead of copying them. 
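The net effect of the executor changes above: the old up-front enforcement that Setup's return value agrees with CanInferOutputs is gone, and declared contiguity is instead verified after the operator has run. A hypothetical condensation of the flow, using only names from these diffs (CPU output branch shown; error handling and stream logic omitted):

#include <vector>
#include "dali/pipeline/operator/operator.h"

// Sketch of Executor::RunHelper's core after this patch; RunOneOp is a
// hypothetical name for illustration.
void RunOneOp(dali::OperatorBase &op, dali::Workspace &ws) {
  std::vector<dali::OutputDesc> output_desc;
  if (op.Setup(output_desc, ws)) {  // operator inferred output shapes/types
    for (int i = 0; i < ws.NumOutput(); i++)
      ws.Output<dali::CPUBackend>(i).Resize(output_desc[i].shape, output_desc[i].type);
  }
  op.Run(ws);
  if (op.HasContiguousOutputs())
    AssertOutputsContiguous(ws);  // file-local helper added above; assert-based,
                                  // so active in debug builds only
}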
- bool is_input_always_contiguous = producer_op_node.op->CanInferOutputs(); + bool is_input_always_contiguous = producer_op_node.op->HasContiguousOutputs(); if (is_input_always_contiguous) { return true; } diff --git a/dali/pipeline/operator/builtin/conditional/logical_not.h b/dali/pipeline/operator/builtin/conditional/logical_not.h index 8092190b16..3d89fb4c02 100644 --- a/dali/pipeline/operator/builtin/conditional/logical_not.h +++ b/dali/pipeline/operator/builtin/conditional/logical_not.h @@ -34,10 +34,6 @@ class LogicalNot : public StatelessOperator { ~LogicalNot() override = default; - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override; void RunImpl(Workspace &ws) override; @@ -59,6 +55,10 @@ class LogicalNotFailForGpu : public Operator { ReportGpuInputError("not", "", true); } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/pipeline/operator/builtin/conditional/merge.h b/dali/pipeline/operator/builtin/conditional/merge.h index e00ba016a8..a35f0c6bfa 100644 --- a/dali/pipeline/operator/builtin/conditional/merge.h +++ b/dali/pipeline/operator/builtin/conditional/merge.h @@ -36,7 +36,7 @@ class Merge : public StatelessOperator { ~Merge() override = default; - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return false; } diff --git a/dali/pipeline/operator/builtin/conditional/split.h b/dali/pipeline/operator/builtin/conditional/split.h index 463d6a2729..6966855d1a 100644 --- a/dali/pipeline/operator/builtin/conditional/split.h +++ b/dali/pipeline/operator/builtin/conditional/split.h @@ -36,7 +36,7 @@ class Split : public StatelessOperator { ~Split() override = default; - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return false; } diff --git a/dali/pipeline/operator/builtin/conditional/validate_logical_expr.h b/dali/pipeline/operator/builtin/conditional/validate_logical_expr.h index 571431c610..a259e2ac68 100644 --- a/dali/pipeline/operator/builtin/conditional/validate_logical_expr.h +++ b/dali/pipeline/operator/builtin/conditional/validate_logical_expr.h @@ -38,7 +38,7 @@ class LogicalValidate : public StatelessOperator { ~LogicalValidate() override = default; - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { return false; } @@ -67,6 +67,10 @@ class LogicalFailForGpu : public StatelessOperator { ReportGpuInputError(name_, side_, true); } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/pipeline/operator/builtin/copy.h b/dali/pipeline/operator/builtin/copy.h index 91c2995ea6..536de58d04 100644 --- a/dali/pipeline/operator/builtin/copy.h +++ b/dali/pipeline/operator/builtin/copy.h @@ -34,10 +34,6 @@ class Copy : public StatelessOperator { DISABLE_COPY_MOVE_ASSIGN(Copy); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); output_desc[0].type = ws.GetInputDataType(0); diff --git a/dali/pipeline/operator/builtin/external_source.h b/dali/pipeline/operator/builtin/external_source.h index fc34787689..fd6bc9ae88 100644 --- a/dali/pipeline/operator/builtin/external_source.h +++ b/dali/pipeline/operator/builtin/external_source.h @@ 
-110,9 +110,7 @@ class ExternalSource : public InputOperator { } - bool CanInferOutputs() const override { - // shape inference during setup is disabled because it can be calculated during the runtime - // depending on the input and output + bool HasContiguousOutputs() const override { return false; } diff --git a/dali/pipeline/operator/builtin/input_operator.h b/dali/pipeline/operator/builtin/input_operator.h index 42d4839655..b5b01e5042 100644 --- a/dali/pipeline/operator/builtin/input_operator.h +++ b/dali/pipeline/operator/builtin/input_operator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -440,7 +440,7 @@ class InputOperator : public Operator, virtual public BatchSizeProvider bool copied_shared_data = false; if (batch.IsContiguous()) { - auto batch_owner = unsafe_sample_owner(const_cast &>(batch), 0); + auto &batch_owner = unsafe_sample_owner(const_cast &>(batch), 0); tl_elm.front()->ShareData(batch_owner, batch.nbytes(), batch.is_pinned(), batch.shape(), batch.type(), batch.device_id(), batch.order()); zero_copy_noncontiguous_gpu_input_ = true; diff --git a/dali/pipeline/operator/builtin/make_contiguous.cc b/dali/pipeline/operator/builtin/make_contiguous.cc index e016e80af6..d974257e82 100644 --- a/dali/pipeline/operator/builtin/make_contiguous.cc +++ b/dali/pipeline/operator/builtin/make_contiguous.cc @@ -22,7 +22,7 @@ void MakeContiguousCPU::RunImpl(Workspace &ws) { auto &output = ws.Output(0); DomainTimeRange tr("[DALI][MakeContiguousCPU] H2H", DomainTimeRange::kBlue); - if (IsPassThrough()) { + if (pass_through_) { output.ShareData(input); } else { int batch_size = input.num_samples(); @@ -76,6 +76,18 @@ bool IsPassThrough(const OperatorBase &op) { DALI_FAIL("This operation should be called only on MakeContiguous Operators."); } +bool SetMakeContiguousMode(OperatorBase &op, MakeContiguousMode mode) { + if (auto *make_contiguous_cpu = dynamic_cast *>(&op)) { + make_contiguous_cpu->SetMode(mode); + } else if (auto *make_contiguous_mixed = dynamic_cast *>(&op)) { + make_contiguous_mixed->SetMode(mode); + } else if (auto *make_contiguous_gpu = dynamic_cast *>(&op)) { + make_contiguous_gpu->SetMode(mode); + } else { + return false; + } + return true; +} DALI_SCHEMA(MakeContiguous) .DocStr(R"code(Move input batch to a contiguous representation, more suitable for execution on the GPU)code") diff --git a/dali/pipeline/operator/builtin/make_contiguous.cu b/dali/pipeline/operator/builtin/make_contiguous.cu index 62df1f2cd4..7bbdd66214 100644 --- a/dali/pipeline/operator/builtin/make_contiguous.cu +++ b/dali/pipeline/operator/builtin/make_contiguous.cu @@ -39,7 +39,7 @@ void MakeContiguousMixed::RunImpl(Workspace &ws) { if (ws.OutputIsType(0)) { auto &output = ws.Output(0); DomainTimeRange tr("[DALI][MakeContiguousMixed] H2H non coalesced", DomainTimeRange::kGreen); - if (IsPassThrough()) { + if (pass_through_) { AccessOrder out_order = output.order(); // A call to ShareData may synchronize the orders and we don't want that. // TODO(michalz): Find a less hacky solution. 
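SetMakeContiguousMode dispatches on the concrete MakeContiguous specialization via dynamic_cast and reports through its return value whether the target was a MakeContiguous at all, so it is safe to apply blindly across a graph. A hypothetical helper in that spirit, mirroring the executor2 analyzer shown earlier:

#include "dali/pipeline/executor/executor2/exec_graph.h"
#include "dali/pipeline/operator/builtin/make_contiguous.h"

// Hypothetical: relax every MakeContiguous node in an executor2 graph to
// opportunistic pass-through; any other operator returns false from
// SetMakeContiguousMode and is simply left untouched.
void RelaxMakeContiguous(dali::exec2::ExecGraph &graph) {
  for (auto &node : graph.Nodes()) {
    if (node.op)
      (void)dali::SetMakeContiguousMode(*node.op, dali::MakeContiguousMode::Opportunistic);
  }
}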
@@ -70,7 +70,7 @@ void MakeContiguousGPU::RunImpl(Workspace &ws) { const auto& input = ws.Input(0); auto& output = ws.Output(0); DomainTimeRange tr("[DALI][MakeContiguousGPU] D2D", DomainTimeRange::kGreen); - if (IsPassThrough()) { + if (pass_through_) { output.ShareData(input); } else { output.Copy(input); diff --git a/dali/pipeline/operator/builtin/make_contiguous.h b/dali/pipeline/operator/builtin/make_contiguous.h index f8f9a4cbae..c18dabf4ac 100644 --- a/dali/pipeline/operator/builtin/make_contiguous.h +++ b/dali/pipeline/operator/builtin/make_contiguous.h @@ -29,6 +29,12 @@ namespace dali { +enum class MakeContiguousMode { + AlwaysCopy, //!< Always perform a copy. + PassThrough, //!< Never copy. + Opportunistic //!< If already contiguous, pass through; otherwise copy. +}; + template class MakeContiguousBase : public StatelessOperator { public: @@ -42,20 +48,20 @@ class MakeContiguousBase : public StatelessOperator { virtual inline ~MakeContiguousBase() = default; - bool CanInferOutputs() const override { - return !pass_through_; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); if (ws.InputIsType(0)) { auto &input = ws.Input(0); output_desc[0].shape = input.shape(); output_desc[0].type = input.type(); + pass_through_ = mode_ == MakeContiguousMode::PassThrough || + (mode_ == MakeContiguousMode::Opportunistic && input.IsContiguousInMemory()); } else { auto &input = ws.Input(0); output_desc[0].shape = input.shape(); output_desc[0].type = input.type(); + pass_through_ = mode_ == MakeContiguousMode::PassThrough || + (mode_ == MakeContiguousMode::Opportunistic && input.IsContiguousInMemory()); } return !pass_through_; } @@ -69,7 +75,7 @@ class MakeContiguousBase : public StatelessOperator { * inputs. */ void MarkPassThrough() { - pass_through_ = true; + mode_ = MakeContiguousMode::PassThrough; } /** @@ -79,15 +85,25 @@ class MakeContiguousBase : public StatelessOperator { * on the graph. */ bool IsPassThrough() const { - return pass_through_; + return mode_ == MakeContiguousMode::PassThrough; + } + + void SetMode(MakeContiguousMode mode) { + mode_ = mode; + } + + MakeContiguousMode GetMode() const { + return mode_; } protected: USE_OPERATOR_MEMBERS(); TensorList cpu_output_buff; bool coalesced = true; - int bytes_per_sample_hint = 0; + // Whether the next batch will be passed through - this value is set in Setup. bool pass_through_ = false; + int bytes_per_sample_hint = 0; + MakeContiguousMode mode_ = MakeContiguousMode::AlwaysCopy; }; @@ -132,6 +148,14 @@ void MarkPassThrough(OperatorBase &make_contiguous); */ bool IsPassThrough(const OperatorBase &make_contiguous); +/** + * @brief Calls MakeContiguousBase::SetMode; invalid for other operators. + * + * @return true if the operator was MakeContiguous and the mode was set, false otherwise. + */ +bool SetMakeContiguousMode(OperatorBase &make_contiguous, MakeContiguousMode mode); + + } // namespace dali #endif // DALI_PIPELINE_OPERATOR_BUILTIN_MAKE_CONTIGUOUS_H_ diff --git a/dali/pipeline/operator/eager_operator.h b/dali/pipeline/operator/eager_operator.h index e3b094331d..2abaa9da14 100644 --- a/dali/pipeline/operator/eager_operator.h +++ b/dali/pipeline/operator/eager_operator.h @@ -307,7 +307,7 @@ EagerOperator::RunImpl( ws_.SetBatchSizes(batch_size); // Setup outputs.
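The mode test duplicated in both branches of SetupImpl above reduces to a single predicate; stated once for clarity (a distillation, not code from the patch):

// Equivalent to the condition repeated in MakeContiguousBase::SetupImpl:
// AlwaysCopy never passes through, PassThrough always does, and
// Opportunistic passes through exactly when the input batch is already
// contiguous in memory (IsContiguousInMemory()).
inline bool ShouldPassThrough(dali::MakeContiguousMode mode, bool input_contiguous_in_memory) {
  return mode == dali::MakeContiguousMode::PassThrough ||
         (mode == dali::MakeContiguousMode::Opportunistic && input_contiguous_in_memory);
}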
- if (op_->Setup(output_desc, ws_) && op_->CanInferOutputs()) { + if (op_->Setup(output_desc, ws_)) { for (size_t i = 0; i < num_outputs_; ++i) { ws_.Output(i).Resize(output_desc[i].shape, output_desc[i].type, BatchContiguity::Contiguous); diff --git a/dali/pipeline/operator/false_gpu_operator.h b/dali/pipeline/operator/false_gpu_operator.h index c8126aa9bc..41ae260e87 100644 --- a/dali/pipeline/operator/false_gpu_operator.h +++ b/dali/pipeline/operator/false_gpu_operator.h @@ -51,7 +51,7 @@ class FalseGPUOperator : public Operator { ~FalseGPUOperator() override = default; protected: - bool CanInferOutputs() const override { + bool HasContiguousOutputs() const override { // To run Setup we need to first copy from device to host. // To avoid delaying the Setup stage, we will do Setup and Run in one go (during Run) return false; diff --git a/dali/pipeline/operator/op_spec_test.cc b/dali/pipeline/operator/op_spec_test.cc index 6f0728aeba..83707620da 100644 --- a/dali/pipeline/operator/op_spec_test.cc +++ b/dali/pipeline/operator/op_spec_test.cc @@ -314,10 +314,6 @@ class TestArgumentInput_Producer : public Operator { public: explicit TestArgumentInput_Producer(const OpSpec &spec) : Operator(spec) {} - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(3); output_desc[0] = {TensorListShape<0>(ws.GetRequestedBatchSize(0)), DALI_INT32}; @@ -359,10 +355,6 @@ class TestArgumentInput_Consumer : public Operator { public: explicit TestArgumentInput_Consumer(const OpSpec &spec) : Operator(spec) {} - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { output_desc.resize(1); output_desc[0] = {uniform_list_shape(ws.GetRequestedBatchSize(0), {1}), DALI_INT32}; diff --git a/dali/pipeline/operator/operator.h b/dali/pipeline/operator/operator.h index f37956efe6..dcd4f10770 100644 --- a/dali/pipeline/operator/operator.h +++ b/dali/pipeline/operator/operator.h @@ -99,9 +99,12 @@ class DLL_PUBLIC OperatorBase { /** * @brief Setup of the operator - to be implemented by derived op. * + * In the setup stage, the operator can determine the shapes and types of the outputs. + * If it does, it can request that the executor allocate the output buffers for it. + * * @param output_desc describe the shape and type of the outputs (for the whole batch) * @param ws - * @return true iff the operator specified the output shape and type + * @return Whether the caller should provide buffers for the outputs. */ virtual bool SetupImpl(std::vector &output_desc, const Workspace &ws) = 0; @@ -112,11 +115,18 @@ class DLL_PUBLIC OperatorBase { virtual void RunImpl(Workspace &ws) = 0; /** - * @brief If Operator can infer the output shapes it means that its output would use a single - * underlying allocation, especially for CPU TensorList will use contiguous mode. + * @brief If true (default), the operator's output will be stored as a contiguous buffer. + * + * The operator should return `true` when: + * - it requests the allocation of the outputs (by returning `true` from `SetupImpl`) + * - it internally guarantees that the output is contiguous (e.g. `MakeContiguous`) + * The operator should return `false` when: + * - it allocates the output on a per-sample basis (e.g. readers) + * - it shuffles or repeats samples in the batch (e.g. `Constant`, `PermuteBatch`, `PerSample`) + * - it forwards the input regardless of its contiguity (e.g.
`Reshape`) */ - virtual bool CanInferOutputs() const { - return false; + virtual bool HasContiguousOutputs() const { + return true; } /** @@ -298,7 +308,7 @@ class DLL_PUBLIC Operator : public OperatorBase { thread_pool.RunAll(); // Propagate metadata from individual samples to the whole batch as working with SampleWorkspace // breaks metadata consistency - it sets it only to samples - FixBatchPropertiesConsistency(ws, CanInferOutputs()); + FixBatchPropertiesConsistency(ws, HasContiguousOutputs()); } }; diff --git a/dali/pipeline/operator/operator_test.cc b/dali/pipeline/operator/operator_test.cc index 17f518684a..a97b78e10e 100644 --- a/dali/pipeline/operator/operator_test.cc +++ b/dali/pipeline/operator/operator_test.cc @@ -57,6 +57,11 @@ namespace { class TestOp : public OperatorBase { public: using OperatorBase::OperatorBase; + + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/pipeline/pipeline_test.cc b/dali/pipeline/pipeline_test.cc index f339eefc9c..bcffe86ee7 100644 --- a/dali/pipeline/pipeline_test.cc +++ b/dali/pipeline/pipeline_test.cc @@ -283,6 +283,10 @@ class DummyPresizeOpCPU : public Operator { : Operator(spec) { } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } @@ -308,6 +312,10 @@ class DummyPresizeOpGPU : public Operator { : Operator(spec) { } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } @@ -333,6 +341,10 @@ class DummyPresizeOpMixed : public Operator { : Operator(spec) { } + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } @@ -622,6 +634,10 @@ class DummyOpToAdd : public Operator { public: explicit DummyOpToAdd(const OpSpec &spec) : Operator(spec) {} + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } @@ -641,6 +657,10 @@ class DummyOpNoSync : public Operator { public: explicit DummyOpNoSync(const OpSpec &spec) : Operator(spec) {} + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc index 129260cebf..0533ec59e6 100644 --- a/dali/python/backend_impl.cc +++ b/dali/python/backend_impl.cc @@ -832,7 +832,7 @@ std::unique_ptr > TensorListGetItemImpl(TensorList &t, } auto ptr = std::make_unique>(); // TODO(klecki): Rework this with proper sample-based tensor batch data structure - auto sample_shared_ptr = unsafe_sample_owner(t, id); + auto &sample_shared_ptr = unsafe_sample_owner(t, id); ptr->ShareData(sample_shared_ptr, t.capacity(), t.is_pinned(), t.shape()[id], t.type(), t.device_id(), t.order()); ptr->SetMeta(t.GetMeta(id)); @@ -1126,7 +1126,7 @@ void ExposeTensorList(py::module &m) { "buffer info for tensor w/ invalid type."); DALI_ENFORCE(tl.IsDenseTensor(), "Tensors in the list must have the same shape"); - raw_mutable_data = unsafe_raw_mutable_data(tl); + raw_mutable_data = contiguous_raw_mutable_data(tl); } if (IsValidType(tl.type())) { @@ -1207,7 +1207,7 @@ void ExposeTensorList(py::module &m) { .def("data_ptr", [](TensorList &tl) { return 
py::reinterpret_borrow( - PyLong_FromVoidPtr(unsafe_raw_mutable_data(tl))); + PyLong_FromVoidPtr(contiguous_raw_mutable_data(tl))); }, R"code( Returns the address of the first element of TensorList. @@ -1413,7 +1413,7 @@ void ExposeTensorList(py::module &m) { .def("data_ptr", [](TensorList &tl) { return py::reinterpret_borrow( - PyLong_FromVoidPtr(unsafe_raw_mutable_data(tl))); + PyLong_FromVoidPtr(contiguous_raw_mutable_data(tl))); }, R"code( Returns the address of the first element of TensorList. diff --git a/dali/test/operators/copy.h b/dali/test/operators/copy.h index 00f926cf4b..5d8c02af0e 100644 --- a/dali/test/operators/copy.h +++ b/dali/test/operators/copy.h @@ -34,6 +34,10 @@ class CopyArgumentOp : public Operator { DISABLE_COPY_MOVE_ASSIGN(CopyArgumentOp); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/test/operators/dummy_op.h b/dali/test/operators/dummy_op.h index 22683dad8b..86aa386e49 100644 --- a/dali/test/operators/dummy_op.h +++ b/dali/test/operators/dummy_op.h @@ -34,6 +34,10 @@ class DummyOp : public Operator { DISABLE_COPY_MOVE_ASSIGN(DummyOp); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/test/operators/exception.h b/dali/test/operators/exception.h index 6b14113ad1..898fc7cbeb 100644 --- a/dali/test/operators/exception.h +++ b/dali/test/operators/exception.h @@ -38,6 +38,10 @@ class ThrowExceptionOp : public Operator { USE_OPERATOR_MEMBERS(); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/test/operators/passthrough.h b/dali/test/operators/passthrough.h index c6e3222f6c..5d3f968844 100644 --- a/dali/test/operators/passthrough.h +++ b/dali/test/operators/passthrough.h @@ -31,6 +31,10 @@ class PassthroughOp : public Operator { DISABLE_COPY_MOVE_ASSIGN(PassthroughOp); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/test/operators/passthrough_with_trace.h b/dali/test/operators/passthrough_with_trace.h index 8682acbd2b..88028cf2ee 100644 --- a/dali/test/operators/passthrough_with_trace.h +++ b/dali/test/operators/passthrough_with_trace.h @@ -33,6 +33,10 @@ class PassthroughWithTraceOp : public Operator { DISABLE_COPY_MOVE_ASSIGN(PassthroughWithTraceOp); protected: + bool HasContiguousOutputs() const override { + return false; + } + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { return false; } diff --git a/dali/test/operators/string_msg_helper.h b/dali/test/operators/string_msg_helper.h index efc09bc515..d95d919268 100644 --- a/dali/test/operators/string_msg_helper.h +++ b/dali/test/operators/string_msg_helper.h @@ -40,10 +40,6 @@ class StringMsgHelper : public Operator { protected: virtual std::string GetMessage(const OpSpec &spec, const Workspace &ws) = 0; - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { returned_message_ = GetMessage(spec_, ws); output_desc.resize(1); diff --git a/dali/test/plugins/dummy/dummy.h b/dali/test/plugins/dummy/dummy.h index 6c0af5430e..c2bb961b4e 100644 --- 
a/dali/test/plugins/dummy/dummy.h +++ b/dali/test/plugins/dummy/dummy.h @@ -33,10 +33,6 @@ class Dummy : public ::dali::Operator { DISABLE_COPY_MOVE_ASSIGN(Dummy); protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc, const ::dali::Workspace &ws) override { const auto &input = ws.Input(0); diff --git a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb index f938fca23b..d6961334f7 100644 --- a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb +++ b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb @@ -31,7 +31,7 @@ "\n", "2. Provide common Setup functions.\n", "\n", - "The implementation of `CanInferOutputs` and `SetupImpl` can be shared across backends. `SetupImpl` provides the shape and type description of the output based on the input, and `CanInferOutputs` informs the executor that the Operator can provide that output description for the entire batch before executing `RunImpl`." + "The implementation of `SetupImpl` can be shared across backends. `SetupImpl` provides the shape and type description of the output based on the input; the return value determines whether the executor should allocate the storage for the operator's output before calling `RunImpl`. `HasContiguousOutputs` declares that the outputs of the operator are contiguous - this is usually true, except for operators which shuffle the batch without copying or legacy operators utilizing `SampleWorkspace` - and that's why we can rely on the default implementation." ] }, { @@ -66,10 +66,6 @@ " Dummy& operator=(Dummy&&) = delete;\n", "\n", " protected:\n", - " bool CanInferOutputs() const override {\n", - " return true;\n", - " }\n", - "\n", " bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc,\n", " const ::dali::Workspace &ws) override {\n", " const auto &input = ws.Input(0);\n", @@ -103,7 +99,7 @@ "\n", "In RunImpl we obtain access to the entire batch that is processed. We get the reference to the CPU thread pool from the workspace `ws` and create tasks that will copy samples from input to output in parallel. The tasks will be ordered by the thread pool from the longest to the shortest, based on the tensor size, to best utilize the worker threads.\n", "\n", - "The outputs are already allocated as we provided the SetupImpl and CanInferOutputs functions." + "The outputs are already allocated as we provided the SetupImpl function."
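The tutorial paragraph above names the one case where the default does not hold: operators that shuffle the batch without copying, or legacy operators built around SampleWorkspace. A hypothetical out-of-tree operator of that kind would opt out explicitly (ShuffleSamples is an illustrative name, not a DALI operator):

#include "dali/pipeline/operator/operator.h"

// Hypothetical plugin operator that reorders samples without copying them;
// its output cannot be promised to occupy one contiguous buffer.
class ShuffleSamples : public ::dali::Operator<::dali::CPUBackend> {
 public:
  explicit ShuffleSamples(const ::dali::OpSpec &spec)
      : ::dali::Operator<::dali::CPUBackend>(spec) {}

 protected:
  bool HasContiguousOutputs() const override {
    return false;  // samples are shared and reordered, not stored back-to-back
  }
  bool SetupImpl(std::vector<::dali::OutputDesc> &, const ::dali::Workspace &) override {
    return false;  // no executor-side allocation; samples are shared in RunImpl
  }
  void RunImpl(::dali::Workspace &ws) override {
    // e.g. share each input sample into a permuted output slot
  }
};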
] }, { diff --git a/docs/examples/custom_operations/custom_operator/customdummy/dummy.h b/docs/examples/custom_operations/custom_operator/customdummy/dummy.h index 74aa53a7a9..9eef36359f 100644 --- a/docs/examples/custom_operations/custom_operator/customdummy/dummy.h +++ b/docs/examples/custom_operations/custom_operator/customdummy/dummy.h @@ -21,10 +21,6 @@ class Dummy : public ::dali::Operator { Dummy& operator=(Dummy&&) = delete; protected: - bool CanInferOutputs() const override { - return true; - } - bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc, const ::dali::Workspace &ws) override { const auto &input = ws.Input(0); diff --git a/docs/examples/custom_operations/custom_operator/naive_histogram/naive_histogram.h b/docs/examples/custom_operations/custom_operator/naive_histogram/naive_histogram.h index 69b9595732..44d050012e 100644 --- a/docs/examples/custom_operations/custom_operator/naive_histogram/naive_histogram.h +++ b/docs/examples/custom_operations/custom_operator/naive_histogram/naive_histogram.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -41,11 +41,6 @@ class NaiveHistogram : public ::dali::Operator { NaiveHistogram &operator=(NaiveHistogram &&) = delete; protected: - bool CanInferOutputs() const override { - return true; - } - - bool SetupImpl(std::vector<::dali::OutputDesc> &output_desc, const ::dali::Workspace &ws) override { /* diff --git a/plugins/video/pkg_src/src/decoder/video_decoder_mixed.h b/plugins/video/pkg_src/src/decoder/video_decoder_mixed.h index 17cddadf6a..0da6ea4f1c 100644 --- a/plugins/video/pkg_src/src/decoder/video_decoder_mixed.h +++ b/plugins/video/pkg_src/src/decoder/video_decoder_mixed.h @@ -37,10 +37,6 @@ class VideoDecoderMixed : public dali::Operator { } } - bool CanInferOutputs() const override { - return true; - } - void ValidateInput(const dali::Workspace &ws) { const auto &input = ws.Input(0); DALI_ENFORCE(input.type() == dali::DALI_UINT8, From 0d55aa05ef11b2dd6697b6dc0a7d1135661e0d26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20D=C4=85bek?= Date: Thu, 17 Oct 2024 11:35:18 +0200 Subject: [PATCH 20/29] Deps update 10 2024 (#5670) Dependency update 10-2024: - DALI_deps moved to the latest commit: c7e3e7b996b0a1b19f5e435d32e64c20d9a28a42 - Google Benchmark updated to v1.9.0 - DLPack updated to v1.0 - FFTS updated to the top of the tree: Sep, 2024 - GoogleTest updated to v1.15.2 - pybind11 updated to v2.13.6 - RapidJSON updated to the top of tree: Sep 24, 2024 Signed-off-by: Marek Dabek --- DALI_DEPS_VERSION | 2 +- conda/third_party/jpeg_turbo/recipe/meta.yaml | 2 +- dali/kernels/erase/erase_gpu_test.cu | 1 + dali/python/bundle-wheel.sh | 15 +++++++++++++++ dali/test/dali_test.h | 1 + third_party/README.rst | 12 ++++++------ third_party/benchmark | 2 +- third_party/dlpack | 2 +- third_party/ffts | 2 +- third_party/googletest | 2 +- third_party/pybind11 | 2 +- third_party/rapidjson | 2 +- 12 files changed, 31 insertions(+), 14 deletions(-) diff --git a/DALI_DEPS_VERSION b/DALI_DEPS_VERSION index 5899248cc0..4ec8cae34a 100644 --- a/DALI_DEPS_VERSION +++ b/DALI_DEPS_VERSION @@ -1 +1 @@ -6d93550b1340c2010fc356b1e16ab6e4dfdc27c0 +c7e3e7b996b0a1b19f5e435d32e64c20d9a28a42 diff --git a/conda/third_party/jpeg_turbo/recipe/meta.yaml 
b/conda/third_party/jpeg_turbo/recipe/meta.yaml index fd4c03457c..84b632761c 100644 --- a/conda/third_party/jpeg_turbo/recipe/meta.yaml +++ b/conda/third_party/jpeg_turbo/recipe/meta.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -{% set build_version = "3.0.3" %} +{% set build_version = "3.0.90" %} package: name: jpeg-turbo diff --git a/dali/kernels/erase/erase_gpu_test.cu b/dali/kernels/erase/erase_gpu_test.cu index 58b11881ce..49f4139ec9 100644 --- a/dali/kernels/erase/erase_gpu_test.cu +++ b/dali/kernels/erase/erase_gpu_test.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include "dali/kernels/common/utils.h" #include "dali/kernels/erase/erase_gpu.h" diff --git a/dali/python/bundle-wheel.sh b/dali/python/bundle-wheel.sh index 4d290f889a..d1f5fef27a 100755 --- a/dali/python/bundle-wheel.sh +++ b/dali/python/bundle-wheel.sh @@ -121,6 +121,21 @@ DEPS_LIST=( "${DEPS_PATH}/lib/libcfitsio.so.4" "${DEPS_PATH}/lib/libaws-cpp-sdk-core.so" "${DEPS_PATH}/lib/libaws-cpp-sdk-s3.so" + "${DEPS_PATH}/lib/libaws-crt-cpp.so" + "${DEPS_PATH}/lib/libaws-c-mqtt.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-event-stream.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-common.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-common.so.1" + "${DEPS_PATH}/lib/libaws-c-sdkutils.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-io.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-cal.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-compression.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-http.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-auth.so.1.0.0" + "${DEPS_PATH}/lib/libaws-checksums.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-s3.so.1.0.0" + "${DEPS_PATH}/lib/libaws-c-s3.so.0unstable" + "${DEPS_PATH}/lib/libs2n.so.1" "lib/libcvcuda.so.0" "lib/libnvcv_types.so.0" # cvcuda adds _d suffix to lib names for debug builds diff --git a/dali/test/dali_test.h b/dali/test/dali_test.h index 9b0cc66087..cc76e2fdfa 100644 --- a/dali/test/dali_test.h +++ b/dali/test/dali_test.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "dali/core/common.h" #include "dali/core/error_handling.h" diff --git a/third_party/README.rst b/third_party/README.rst index 88409c18d8..e89021b6f5 100644 --- a/third_party/README.rst +++ b/third_party/README.rst @@ -5,7 +5,7 @@ This part of the repository contains extra dependencies required to build DALI, +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ | Repository | Version | License | +==================================================================+===========================================================================================================================+===================================================================================================================+ -| `Google Benchmark `_ | `1.8.4 `_ | `Apache License 2.0 `_ | +| `Google Benchmark `_ | `1.9.0 `_ | `Apache License 2.0 `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ | `Boost Preprocessor `_ | `1.85.0 `_ | `Boost Software License 1.0 `_ | 
+------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ @@ -15,15 +15,15 @@ This part of the repository contains extra dependencies required to build DALI, +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ | `CV-CUDA `_ | `0.7.0 beta `_ | `Apache License 2.0 `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ -| `DLPack `_ | `0.8 `_ | `Apache License 2.0 `_ | +| `DLPack `_ | `1.0 `_ | `Apache License 2.0 `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ -| `FFTS `_ | `Custom fork top-of-tree (Jan 23, 2020) `_ | `BSD 3-Clause License `_ | +| `FFTS `_ | `Custom fork top-of-tree (Sep 6, 2024) `_ | `BSD 3-Clause License `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ -| `GoogleTest `_ | `1.14.0 `_ | `BSD 3-Clause License `_ | +| `GoogleTest `_ | `1.15.2 `_ | `BSD 3-Clause License `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ -| `pybind11 `_ | `2.13.1 `_ | `BSD 3-Clause License `_ | +| `pybind11 `_ | `2.13.6 `_ | `BSD 3-Clause License `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ -| `RapidJSON `_ | `Top-of-tree (Apr 9, 2024) `_ | `MIT License, BSD 3-Clause License, JSON License `_ | +| `RapidJSON `_ | `Top-of-tree (Sep 24, 2024) `_ | `MIT License, BSD 3-Clause License, JSON License `_ | +------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ | `black `_ | `24.4.2 `_ | `MIT License `_ | 
+------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ diff --git a/third_party/benchmark b/third_party/benchmark index a4cf155615..12235e2465 160000 --- a/third_party/benchmark +++ b/third_party/benchmark @@ -1 +1 @@ -Subproject commit a4cf155615c63e019ae549e31703bf367df5b471 +Subproject commit 12235e24652fc7f809373e7c11a5f73c5763fc4c diff --git a/third_party/dlpack b/third_party/dlpack index 365b823ced..bbd2f4d324 160000 --- a/third_party/dlpack +++ b/third_party/dlpack @@ -1 +1 @@ -Subproject commit 365b823cedb281cd0240ca601aba9b78771f91a3 +Subproject commit bbd2f4d32427e548797929af08cfe2a9cbb3cf12 diff --git a/third_party/ffts b/third_party/ffts index c9a9f61a60..95489ebcd6 160000 --- a/third_party/ffts +++ b/third_party/ffts @@ -1 +1 @@ -Subproject commit c9a9f61a60505751cac385ed062ce2720bdf07d4 +Subproject commit 95489ebcd6fc136c6a76f50f57d43e6072e2bd38 diff --git a/third_party/googletest b/third_party/googletest index f8d7d77c06..b514bdc898 160000 --- a/third_party/googletest +++ b/third_party/googletest @@ -1 +1 @@ -Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571 +Subproject commit b514bdc898e2951020cbdca1304b75f5950d1f59 diff --git a/third_party/pybind11 b/third_party/pybind11 index 941f45bcb5..a2e59f0e70 160000 --- a/third_party/pybind11 +++ b/third_party/pybind11 @@ -1 +1 @@ -Subproject commit 941f45bcb51457884fa1afd6e24a67377d70f75c +Subproject commit a2e59f0e7065404b44dfe92a28aca47ba1378dc4 diff --git a/third_party/rapidjson b/third_party/rapidjson index ab1842a2da..815e6e7e7e 160000 --- a/third_party/rapidjson +++ b/third_party/rapidjson @@ -1 +1 @@ -Subproject commit ab1842a2dae061284c0a62dca1cc6d5e7e37e346 +Subproject commit 815e6e7e7e14be44a6c15d9aefed232ff064cad0 From ef604c4a0045019b98026b1f917cb600cb9ab14d Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:03:35 +0200 Subject: [PATCH 21/29] Fix the redundant usage of pinned memory in the numpy cpu reader (#5678) - the numpy CPU reader used pinned memory whenever the `dont_use_mmap` option was enabled (see the usage sketch below).
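A usage sketch of the affected configuration, assuming a hypothetical dataset directory; with `dont_use_mmap=True` the reader reads file contents into a host buffer instead of memory-mapping the files, and that buffer does not need to be pinned:

    import nvidia.dali.fn as fn
    from nvidia.dali import pipeline_def

    @pipeline_def(batch_size=8, num_threads=4, device_id=None)
    def numpy_reader_pipeline(data_dir):
        # dont_use_mmap=True issues regular reads instead of mmapping the
        # files; before this fix, the destination buffer was allocated from
        # pinned (page-locked) memory even in a CPU-only pipeline.
        return fn.readers.numpy(file_root=data_dir, dont_use_mmap=True)

    pipe = numpy_reader_pipeline("/path/to/npy_dir")  # hypothetical location
    pipe.build()

Pinned allocations are only useful for staging host-to-device copies, so a CPU-only reader gains nothing from them while consuming scarce page-locked memory.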
This PR fixes this and adds a corresponding test Signed-off-by: Janusz Lisiecki --- dali/operators/reader/numpy_reader_op.cc | 1 + dali/test/python/test_dali_cpu_only.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dali/operators/reader/numpy_reader_op.cc b/dali/operators/reader/numpy_reader_op.cc index e86c7e4a5a..9c5be56913 100644 --- a/dali/operators/reader/numpy_reader_op.cc +++ b/dali/operators/reader/numpy_reader_op.cc @@ -314,6 +314,7 @@ void NumpyReaderCPU::Prefetch() { } target->data.ShareData(tmp_mem, target->nbytes, false, target->shape, target->type, -1); } else { + if (!target->data.has_data()) target->data.set_pinned(false); target->data.Resize(target->shape, target->type); auto data_ptr = static_cast(target->data.raw_mutable_data()); Index ret = target->current_file->Read(data_ptr, target->nbytes); diff --git a/dali/test/python/test_dali_cpu_only.py b/dali/test/python/test_dali_cpu_only.py index ef94cf208c..002588b0a2 100644 --- a/dali/test/python/test_dali_cpu_only.py +++ b/dali/test/python/test_dali_cpu_only.py @@ -1001,6 +1001,7 @@ def get_labels(): def test_numpy_reader_cpu(): with setup_test_numpy_reader_cpu() as test_data_root: check_no_input(fn.readers.numpy, file_root=test_data_root) + check_no_input(fn.readers.numpy, file_root=test_data_root, dont_use_mmap=True) @attr("pytorch") From 55de733b9f5d26aa0f602ecea0bf5acbb9620b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20D=C4=85bek?= Date: Mon, 21 Oct 2024 17:05:40 +0200 Subject: [PATCH 22/29] Update of FFmpeg to n7.1 (#5681) * Update of FFmpeg to n7.1 Signed-off-by: Marek Dabek --- conda/third_party/dali_ffmpeg/recipe/meta.yaml | 8 ++++---- dali/operators/reader/loader/video/frames_decoder.cc | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conda/third_party/dali_ffmpeg/recipe/meta.yaml b/conda/third_party/dali_ffmpeg/recipe/meta.yaml index 8316b44d69..416f74a0f9 100644 --- a/conda/third_party/dali_ffmpeg/recipe/meta.yaml +++ b/conda/third_party/dali_ffmpeg/recipe/meta.yaml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -{% set build_version = "7.0.2" %} +{% set build_version = "7.1" %} package: name: dali-ffmpeg version: {{ build_version }} source: - fn: FFmpeg-n7.0.2.tar.gz - url: https://developer.download.nvidia.com/compute/redist/nvidia-dali/FFmpeg-n7.0.2.tar.gz - sha256: 5eb46d18d664a0ccadf7b0adee03bd3b7fa72893d667f36c69e202a807e6d533 + fn: FFmpeg-n7.1.tar.gz + url: https://developer.download.nvidia.com/compute/redist/nvidia-dali/FFmpeg-n7.1.tar.gz + sha256: 7ddad2d992bd250a6c56053c26029f7e728bebf0f37f80cf3f8a0e6ec706431a build: number: 0 diff --git a/dali/operators/reader/loader/video/frames_decoder.cc b/dali/operators/reader/loader/video/frames_decoder.cc index f2dc1b542d..99c08e1121 100644 --- a/dali/operators/reader/loader/video/frames_decoder.cc +++ b/dali/operators/reader/loader/video/frames_decoder.cc @@ -527,6 +527,7 @@ void FramesDecoder::SeekFrame(int frame_id) { // Seeking clears av buffers, so reset flush state info if (flush_state_) { + while (ReadFlushFrame(nullptr, false)) {} flush_state_ = false; } From 6b3f5d443942e461dd6a64edffb1e3fea372a98c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= Date: Tue, 22 Oct 2024 14:47:05 +0200 Subject: [PATCH 23/29] Make black and flake8 run independently. 
(#5685) Signed-off-by: Michal Zientkiewicz --- cmake/lint.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/lint.cmake b/cmake/lint.cmake index 7352c0182a..ddb1602618 100644 --- a/cmake/lint.cmake +++ b/cmake/lint.cmake @@ -67,10 +67,9 @@ add_custom_target(lint-python-flake COMMENT "Performing Python linter check" ) -add_dependencies(lint-python-flake lint-python-black) add_custom_target(lint-python) -add_dependencies(lint-python lint-python-flake lint-python-bandit) +add_dependencies(lint-python lint-python-black lint-python-flake lint-python-bandit) add_custom_target(lint) add_dependencies(lint lint-cpp lint-python) From 9a800501b11883d703b1ad2c7af6c5bab7e8278c Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Wed, 23 Oct 2024 10:21:33 +0200 Subject: [PATCH 24/29] Fix inverted mmap inside webdataset reader (#5683) - fixes the inverted usage of mmap inside the webdataset reader, which memory-mapped files when the user was not asking for it Signed-off-by: Janusz Lisiecki --- dali/operators/reader/loader/webdataset_loader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dali/operators/reader/loader/webdataset_loader.cc b/dali/operators/reader/loader/webdataset_loader.cc index 118df59ff3..b005f08bcf 100644 --- a/dali/operators/reader/loader/webdataset_loader.cc +++ b/dali/operators/reader/loader/webdataset_loader.cc @@ -382,14 +382,14 @@ void WebdatasetLoader::PrepareMetadataImpl() { FileStream::Options opts; opts.read_ahead = read_ahead_; - opts.use_mmap = copy_read_data_; + opts.use_mmap = !copy_read_data_; opts.use_odirect = false; // initializing all the readers wds_shards_.reserve(paths_.size()); for (auto& path : paths_) { // If an actual URI, disable mmap - opts.use_mmap = copy_read_data_; + opts.use_mmap = !copy_read_data_; wds_shards_.emplace_back(FileStream::Open(path, opts)); } From 3b4573a3b23a8aa8eddbfe5c92c5b65e52d7a846 Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:33:40 +0200 Subject: [PATCH 25/29] Add an ability to rewind at the end of the video (#5676) - in some cases, seeking to the last key frame in the video leads to a premature end of decoding: decoding does not start, yet DALI keeps waiting for frames (see the sketch below).
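A minimal sketch of a pipeline that exercises this path by decoding sequences up to the end of a file (the file path and sequence length are hypothetical); the patch description and fix continue below:

    import nvidia.dali.fn as fn
    from nvidia.dali import pipeline_def

    @pipeline_def(batch_size=1, num_threads=2, device_id=0)
    def video_tail_pipeline(files):
        # Sequences that land near the end of the file force a seek to the
        # last key frame; decoding must still deliver the trailing frames.
        return fn.readers.video(
            device="gpu",
            filenames=files,
            sequence_length=16,
            random_shuffle=False,
        )

    pipe = video_tail_pipeline(["/path/to/video.mp4"])  # hypothetical file
    pipe.build()
    pipe.run()  # iterating through the epoch eventually reads the file tail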
This PR adds a rewind to the key frame preceding the last one (and, if needed, further back) to make sure the last frames can be decoded Signed-off-by: Janusz Lisiecki --- dali/operators/reader/loader/video_loader.cc | 42 +++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/dali/operators/reader/loader/video_loader.cc b/dali/operators/reader/loader/video_loader.cc index a9d280fb75..0a5d2142f6 100644 --- a/dali/operators/reader/loader/video_loader.cc +++ b/dali/operators/reader/loader/video_loader.cc @@ -576,29 +576,37 @@ void VideoLoader::read_file() { int frames_send = 0; VidReqStatus dec_status = VidReqStatus::REQ_IN_PROGRESS; - while (av_read_frame(file.fmt_ctx_.get(), &raw_pkt) >= 0) { - auto pkt = pkt_ptr(&raw_pkt, av_packet_unref); + int read_frame_ret = 0; + while ((read_frame_ret = av_read_frame(file.fmt_ctx_.get(), &raw_pkt)) >= 0 || + dec_status == VidReqStatus::REQ_NOT_STARTED) { + pkt_ptr pkt = {nullptr, av_packet_unref}; + int64_t frame = 0; - stats_.bytes_read += pkt->size; - stats_.packets_read++; + if (read_frame_ret >= 0) { + pkt = pkt_ptr(&raw_pkt, av_packet_unref); - if (pkt->stream_index != file.vid_stream_idx_) { - continue; - } + stats_.bytes_read += pkt->size; + stats_.packets_read++; - auto frame = av_rescale_q(pkt->pts - file.start_time_, - file.stream_base_, - file.frame_base_); - LOG_LINE << "Frame candidate " << frame << " (for " << req.frame <<" )...\n"; + if (pkt->stream_index != file.vid_stream_idx_) { + continue; + } - file.last_frame_ = frame; - key = (pkt->flags & AV_PKT_FLAG_KEY) != 0; + frame = av_rescale_q(pkt->pts - file.start_time_, + file.stream_base_, + file.frame_base_); + LOG_LINE << "Frame candidate " << frame << " (for " << req.frame <<" )...\n"; + + file.last_frame_ = frame; + key = (pkt->flags & AV_PKT_FLAG_KEY) != 0; + } // if decoding hasn't produced any frames after providing kStartupFrameThreshold frames, // or we are at next key frame if (last_key_frame != -1 && ((key && last_key_frame != frame && last_key_frame != 0) || - frames_send > kStartupFrameThreshold) && + frames_send > kStartupFrameThreshold || + read_frame_ret < 0) && dec_status == VidReqStatus::REQ_NOT_STARTED) { if (last_key_frame <= 0) { if (vid_decoder_) { @@ -617,7 +625,11 @@ void VideoLoader::read_file() { --previous_last_key_frame; } LOG_LINE << "Decoding not started, seek to preceding key frame, " - << "current frame " << frame + << " frames_send vs kStartupFrameThreshold: " + << frames_send << " / " << kStartupFrameThreshold + << ", av_read_frame result: " + << read_frame_ret + << ", current frame " << frame << ", look for a key frame before " << previous_last_key_frame << ", is_key " << key << std::endl; seek(file, previous_last_key_frame); From af1491936561f7213b3a96804b94433573f99acc Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:21:10 +0200 Subject: [PATCH 26/29] Make sure that the proper video stream index is used by the GPU decoder (#5682) - fixes the GPU decoder's use of the hard-coded stream index 0 by using the properly obtained video stream index Signed-off-by: Janusz Lisiecki --- dali/operators/input/video_input_cpu.cc | 2 ++ dali/operators/input/video_input_mixed.cc | 2 ++ .../reader/loader/video/frames_decoder.h | 5 ++-- .../reader/loader/video/frames_decoder_gpu.cc | 13 ++++++--- dali/test/python/input/test_video.py | 27 ++++++++++++++++++- 5 files changed, 42 insertions(+), 7 deletions(-) diff --git a/dali/operators/input/video_input_cpu.cc b/dali/operators/input/video_input_cpu.cc index 12beeb3ab5..4baf4df22c
100644 --- a/dali/operators/input/video_input_cpu.cc +++ b/dali/operators/input/video_input_cpu.cc @@ -24,6 +24,8 @@ void VideoInput::CreateDecoder(const Workspace auto data = reinterpret_cast(sample.data()); size_t size = sample.shape().num_elements(); this->frames_decoders_[0] = std::make_unique(data, size, false); + DALI_ENFORCE(this->frames_decoders_[0]->IsValid(), + "Failed to create video decoder for provided video data"); } diff --git a/dali/operators/input/video_input_mixed.cc b/dali/operators/input/video_input_mixed.cc index d0cca64b00..b69244c947 100644 --- a/dali/operators/input/video_input_mixed.cc +++ b/dali/operators/input/video_input_mixed.cc @@ -25,6 +25,8 @@ void VideoInput::CreateDecoder(const Works size_t size = sample.shape().num_elements(); this->frames_decoders_[0] = std::make_unique(data, size, ws.stream(), false); + DALI_ENFORCE(this->frames_decoders_[0]->IsValid(), + "Failed to create video decoder for provided video data"); } diff --git a/dali/operators/reader/loader/video/frames_decoder.h b/dali/operators/reader/loader/video/frames_decoder.h index 75580b2691..c6742e0655 100644 --- a/dali/operators/reader/loader/video/frames_decoder.h +++ b/dali/operators/reader/loader/video/frames_decoder.h @@ -217,6 +217,9 @@ class DLL_PUBLIC FramesDecoder { bool is_full_range_ = false; + // False when the file doesn't have any correct content or doesn't have a valid video stream + bool is_valid_ = false; + std::optional zero_latency_ = {}; private: @@ -275,8 +278,6 @@ class DLL_PUBLIC FramesDecoder { int channels_ = 3; bool flush_state_ = false; bool is_vfr_ = false; - // False when the file doesn't have any correct content or doesn't have valid video stream - bool is_valid_ = false; const std::string filename_ = {}; std::optional memory_video_file_ = {}; diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.cc b/dali/operators/reader/loader/video/frames_decoder_gpu.cc index 476b36c53f..1331c6453d 100644 --- a/dali/operators/reader/loader/video/frames_decoder_gpu.cc +++ b/dali/operators/reader/loader/video/frames_decoder_gpu.cc @@ -331,7 +331,8 @@ void FramesDecoderGpu::InitBitStreamFilter() { } DALI_ENFORCE( - avcodec_parameters_copy(bsfc_->par_in, av_state_->ctx_->streams[0]->codecpar) >= 0, + avcodec_parameters_copy(bsfc_->par_in, + av_state_->ctx_->streams[av_state_->stream_id_]->codecpar) >= 0, "Unable to copy bit stream filter parameters"); DALI_ENFORCE( av_bsf_init(bsfc_) >= 0, @@ -364,7 +365,11 @@ void FramesDecoderGpu::InitGpuParser() { InitBitStreamFilter(); filtered_packet_ = av_packet_alloc(); - DALI_ENFORCE(filtered_packet_, "Could not allocate av packet"); + if (!filtered_packet_) { + DALI_WARN(make_string("Could not allocate av packet for \"", Filename(), "\"")); + is_valid_ = false; + return; + } auto codec_type = GetCodecType(); @@ -380,8 +385,8 @@ void FramesDecoderGpu::InitGpuParser() { parser_info.pfnDecodePicture = frame_dec_gpu_impl::process_picture_decode; parser_info.pfnDisplayPicture = nullptr; - auto extradata = av_state_->ctx_->streams[0]->codecpar->extradata; - auto extradata_size = av_state_->ctx_->streams[0]->codecpar->extradata_size; + auto extradata = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar->extradata; + auto extradata_size = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar->extradata_size; memset(&parser_extinfo, 0, sizeof(parser_extinfo)); parser_info.pExtVideoInfo = &parser_extinfo; diff --git a/dali/test/python/input/test_video.py b/dali/test/python/input/test_video.py index aaa15f5a82..c6c7edaefd 
100644 --- a/dali/test/python/input/test_video.py +++ b/dali/test/python/input/test_video.py @@ -13,6 +13,7 @@ # limitations under the License. import glob +import os import itertools import numpy as np import nvidia.dali.fn as fn @@ -22,7 +23,8 @@ from nvidia.dali import pipeline_def from test_utils import get_dali_extra_path, to_array -filenames = glob.glob(f"{get_dali_extra_path()}/db/video/[cv]fr/*.mp4") +test_data_root = get_dali_extra_path() +filenames = glob.glob(f"{test_data_root}/db/video/[cv]fr/*.mp4") # filter out HEVC because some GPUs do not support it filenames = filter(lambda filename: "hevc" not in filename, filenames) # mpeg4 is not yet supported in the CPU operator @@ -254,3 +256,26 @@ def test_video_input_input_queue(device, n_test_files): glob="No data was provided to the InputOperator. Make sure to feed it properly.", ): input_pipe.run() + + +@params(*device_values) +def test_video_input_audio_stream(device): + """ + Checks if video decoding works when an audio stream is present + """ + input_name = "VIDEO_INPUT" + + input_pipe = video_input_pipeline( + input_name=input_name, + batch_size=3, + sequence_length=4, + device=device, + **common_pipeline_params, + ) + + filename = os.path.join(test_data_root, "db", "video", "sintel", "sintel_trailer-720p.mp4") + test_file = np.fromfile(filename, dtype=np.uint8) + input_pipe.build() + input_pipe.feed_input(input_name, np.array([[test_file]])) + + input_pipe.run() From 52d314cd33068f75273531a092b534e57967b6de Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:11:35 +0200 Subject: [PATCH 27/29] Move from deprecated distutils to packaging (#5687) - converts all calls from distutils to packaging, as distutils is deprecated and will eventually be removed (see the sketch below) Signed-off-by: Janusz Lisiecki --- Acknowledgements.txt | 34 ++ conda/dali_python_bindings/recipe/meta.yaml | 2 + .../nvidia/dali/_autograph/pyct/gast_util.py | 14 +- .../python/nvidia/dali/plugin/jax/__init__.py | 4 +- .../dali/plugin/jax/fn/_jax_function_impl.py | 4 +- .../plugin/numba/experimental/__init__.py | 14 +- dali/python/nvidia/dali/plugin/paddle.py | 8 +- dali/python/nvidia/dali/plugin/tf.py | 14 +- dali/python/setup.py.in | 1 + .../test/python/autograph/pyct/test_loader.py | 4 +- .../test/python/test_dali_tf_dataset_mnist.py | 6 +- .../test_dali_tf_dataset_mnist_eager.py | 4 +- .../test_dali_tf_dataset_mnist_graph.py | 12 +- dali/test/python/test_utils.py | 6 +- dali_tf_plugin/build_dali_tf.sh | 2 +- dali_tf_plugin/dali_tf_plugin_install_tool.py | 10 +- dali_tf_plugin/dali_tf_plugin_utils.py | 4 +- dali_tf_plugin/setup.py.in | 3 +- docker/Dockerfile.customopbuilder.clean | 1 + .../use_cases/paddle/resnet50/utils/config.py | 489 ++++++++++-------- .../tensorflow/resnet-n/nvutils/common.py | 6 +- .../tensorflow/resnet-n/nvutils/hvd_patch.py | 4 +- .../tensorflow/resnet-n/nvutils/runner.py | 10 +- .../tensorflow/resnet-n/nvutils/runner_ctl.py | 8 +- qa/TL1_tensorflow_dataset/test_impl.sh | 8 +- 25 files changed, 401 insertions(+), 271 deletions(-) diff --git a/Acknowledgements.txt b/Acknowledgements.txt index 5ab2b5ce7a..6cae080559 100644 --- a/Acknowledgements.txt +++ b/Acknowledgements.txt @@ -4410,3 +4410,37 @@ products or services of Licensee, or any third party. 8. By copying, installing or otherwise using Python, Licensee agrees to be bound by the terms and conditions of this License Agreement.
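Before the per-file changes, a sketch of the migration pattern this patch applies throughout the tree: `distutils.version.LooseVersion` comparisons become PEP 440 aware `packaging.version.Version` comparisons (the helper name below is illustrative only):

    from packaging.version import Version

    # Old (deprecated): LooseVersion(tf.__version__) >= LooseVersion("2.10")
    # New:
    def needs_cpp17(tf_version: str) -> bool:
        # TF 2.10 and newer require building custom ops with --std=c++17.
        return Version(tf_version) >= Version("2.10")

    assert needs_cpp17("2.12.0")
    assert not needs_cpp17("2.4.1")  # 2.4 < 2.10 under PEP 440 ordering

Unlike `LooseVersion`, `Version` also orders pre-release and dev suffixes (for example "2.10.0rc1") consistently.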
+ +============================================================================== +str2bool + + +BSD 3-Clause License + +Copyright (c) 2017, SymonSoft +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/conda/dali_python_bindings/recipe/meta.yaml b/conda/dali_python_bindings/recipe/meta.yaml index 9ccd2dca1b..a15e07c203 100644 --- a/conda/dali_python_bindings/recipe/meta.yaml +++ b/conda/dali_python_bindings/recipe/meta.yaml @@ -81,6 +81,7 @@ requirements: - astunparse >=1.6.0 - gast >=0.3.3 - dm-tree >=0.1.8 + - packaging - nvidia-dali-core{% if environ.get('NVIDIA_DALI_BUILD_FLAVOR', '')|length %}{{"-" + environ.get('NVIDIA_DALI_BUILD_FLAVOR', '')}}{% endif %}-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} ={{ environ.get('DALI_CONDA_BUILD_VERSION', '') }} - nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} run: @@ -94,6 +95,7 @@ requirements: - astunparse >=1.6.0 - gast >=0.3.3 - dm-tree >=0.1.8 + - packaging - nvidia-dali-core{% if environ.get('NVIDIA_DALI_BUILD_FLAVOR', '')|length %}{{"-" + environ.get('NVIDIA_DALI_BUILD_FLAVOR', '')}}{% endif %}-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} ={{ environ.get('DALI_CONDA_BUILD_VERSION', '') }} - nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} about: diff --git a/dali/python/nvidia/dali/_autograph/pyct/gast_util.py b/dali/python/nvidia/dali/_autograph/pyct/gast_util.py index 3424a7be96..107f626416 100644 --- a/dali/python/nvidia/dali/_autograph/pyct/gast_util.py +++ b/dali/python/nvidia/dali/_autograph/pyct/gast_util.py @@ -18,9 +18,19 @@ import functools import gast -from distutils.version import LooseVersion +from packaging.version import Version +def convert_to_version(function): + """Makes sure that returned function value is a Version object""" + + def wrap_function(*args, **kwargs): + return Version(function(*args, **kwargs)) + + return wrap_function + + +@convert_to_version def get_gast_version(): """Gast exports `__version__` from 0.5.3 onwards, we need to look it up in a different way.""" if hasattr(gast, "__version__"): @@ -76,7 
+86,7 @@ def _compat_assign_gast_5(targets, value, type_comment): return gast.Assign(targets=targets, value=value, type_comment=type_comment) -if get_gast_version() < LooseVersion("0.5"): +if get_gast_version() < Version("0.5"): compat_assign = _compat_assign_gast_4 else: compat_assign = _compat_assign_gast_5 diff --git a/dali/python/nvidia/dali/plugin/jax/__init__.py b/dali/python/nvidia/dali/plugin/jax/__init__.py index edd29ebc79..08945e3743 100644 --- a/dali/python/nvidia/dali/plugin/jax/__init__.py +++ b/dali/python/nvidia/dali/plugin/jax/__init__.py @@ -16,7 +16,7 @@ from . import fn # noqa: F401 -from distutils.version import LooseVersion +from packaging.version import Version from .iterator import DALIGenericIterator, data_iterator assert ( @@ -24,7 +24,7 @@ ), "DALI JAX support requires Python 3.8 or above" -assert LooseVersion(jax.__version__) >= LooseVersion( +assert Version(jax.__version__) >= Version( "0.4.11" ), "DALI JAX support requires JAX 0.4.11 or above" diff --git a/dali/python/nvidia/dali/plugin/jax/fn/_jax_function_impl.py b/dali/python/nvidia/dali/plugin/jax/fn/_jax_function_impl.py index ccb5126a35..06caa66bbe 100644 --- a/dali/python/nvidia/dali/plugin/jax/fn/_jax_function_impl.py +++ b/dali/python/nvidia/dali/plugin/jax/fn/_jax_function_impl.py @@ -14,7 +14,7 @@ from typing import Optional, Protocol, Tuple, Union -from distutils.version import LooseVersion +from packaging.version import Version import jax import jax.dlpack @@ -170,7 +170,7 @@ def flip_horizontal(image: jax.Array): The transformed function that processes DALI-traced batches (DataNodes). """ - if LooseVersion(jax.__version__) < LooseVersion("0.4.16"): + if Version(jax.__version__) < Version("0.4.16"): raise RuntimeError("DALI `jax_function` requires JAX 0.4.16 or above.") def decorator(function): diff --git a/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py b/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py index 6e7a4b7733..3db1f654fd 100644 --- a/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py +++ b/dali/python/nvidia/dali/plugin/numba/experimental/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from distutils.version import LooseVersion +from packaging.version import Version from nvidia.dali.pipeline import Pipeline from nvidia.dali.data_node import DataNode as _DataNode @@ -57,8 +57,8 @@ # Minimal version of Numba that is required for Numba GPU operator to work minimal_numba_version = { - 11: LooseVersion("0.55.2"), - 12: LooseVersion("0.57.0"), + 11: Version("0.55.2"), + 12: Version("0.57.0"), } @@ -196,7 +196,7 @@ def _get_run_fn_gpu(self, run_fn, types, dims): for dali_type, ndim in zip(types, dims): cuda_arguments.append(numba_types.Array(_to_numba[dali_type], ndim, "C")) - if LooseVersion(nb.__version__) < LooseVersion("0.57.0"): + if Version(nb.__version__) < Version("0.57.0"): cres = cuda.compiler.compile_cuda(run_fn, numba_types.void, cuda_arguments) else: pipeline = Pipeline.current() @@ -210,7 +210,7 @@ def _get_run_fn_gpu(self, run_fn, types, dims): code = run_fn.__code__ filename = code.co_filename linenum = code.co_firstlineno - if LooseVersion(nb.__version__) < LooseVersion("0.57.0"): + if Version(nb.__version__) < Version("0.57.0"): nvvm_options["debug"] = False nvvm_options["lineinfo"] = False lib, _ = tgt_ctx.prepare_cuda_kernel( @@ -509,7 +509,7 @@ def __init__( @staticmethod def _check_minimal_numba_version(throw: bool = True): - current_version = LooseVersion(nb.__version__) + current_version = Version(nb.__version__) toolkit_version = cuda.runtime.get_version() if toolkit_version[0] not in minimal_numba_version: if throw: @@ -522,7 +522,7 @@ def _check_minimal_numba_version(throw: bool = True): raise RuntimeError( f"Insufficient Numba version. Numba GPU operator " f"requires Numba {str(min_ver)} or higher. " - f"Detected version: {str(LooseVersion(nb.__version__))}." + f"Detected version: {str(Version(nb.__version__))}." 
) else: return False diff --git a/dali/python/nvidia/dali/plugin/paddle.py b/dali/python/nvidia/dali/plugin/paddle.py index 04fc1208f4..b3989d69d5 100644 --- a/dali/python/nvidia/dali/plugin/paddle.py +++ b/dali/python/nvidia/dali/plugin/paddle.py @@ -18,7 +18,7 @@ import numpy as np import paddle -from distutils.version import LooseVersion +from packaging.version import Version from nvidia.dali import types from nvidia.dali.backend import TensorListCPU, TensorGPU, TensorListGPU @@ -26,11 +26,9 @@ from nvidia.dali.plugin.base_iterator import LastBatchPolicy if isinstance(paddle.__version__, str): - assert LooseVersion(paddle.__version__) == LooseVersion("0.0.0") or LooseVersion( + assert Version(paddle.__version__) == Version("0.0.0") or Version( paddle.__version__ - ) >= LooseVersion( - "2.0.0" - ), "DALI PaddlePaddle support requires Paddle develop or release >= 2.0.0" + ) >= Version("2.0.0"), "DALI PaddlePaddle support requires Paddle develop or release >= 2.0.0" dtype_map = { diff --git a/dali/python/nvidia/dali/plugin/tf.py b/dali/python/nvidia/dali/plugin/tf.py index 460f25b28b..7ae887e79d 100644 --- a/dali/python/nvidia/dali/plugin/tf.py +++ b/dali/python/nvidia/dali/plugin/tf.py @@ -26,7 +26,7 @@ from nvidia.dali._utils.external_source_impl import _get_generator_from_source_desc from nvidia.dali._utils.external_source_impl import _cycle_enabled -from distutils.version import LooseVersion +from packaging.version import Version import warnings from nvidia.dali_tf_plugin import dali_tf_plugin @@ -307,29 +307,29 @@ def DALIRawIterator(): def _get_tf_version(): - return LooseVersion(tf.__version__) + return Version(tf.__version__) -MIN_TENSORFLOW_VERSION = LooseVersion("1.15") +MIN_TENSORFLOW_VERSION = Version("1.15") def dataset_compatible_tensorflow(): """Returns ``True`` if current TensorFlow version is compatible with DALIDataset.""" - return LooseVersion(tf.__version__) >= MIN_TENSORFLOW_VERSION + return Version(tf.__version__) >= MIN_TENSORFLOW_VERSION def dataset_inputs_compatible_tensorflow(): """Returns ``True`` if the current TensorFlow version is compatible with experimental.DALIDatasetWithInputs and input Datasets can be used with DALI. """ - return LooseVersion(tf.__version__) >= LooseVersion("2.4.1") + return Version(tf.__version__) >= Version("2.4.1") def dataset_distributed_compatible_tensorflow(): """Returns ``True`` if the tf.distribute APIs for current TensorFlow version are compatible with DALIDataset. 
""" - return LooseVersion(tf.__version__) >= LooseVersion("2.5.0") + return Version(tf.__version__) >= Version("2.5.0") def _get_experimental(): @@ -813,7 +813,7 @@ def _as_variant_tensor(self): fail_on_device_mismatch=self._fail_on_device_mismatch, ) - if _get_tf_version() < LooseVersion("2.0"): + if _get_tf_version() < Version("2.0"): class _DALIDatasetImpl(dataset_ops.DatasetV1Adapter): @functools.wraps(_DALIDatasetV2.__init__) diff --git a/dali/python/setup.py.in b/dali/python/setup.py.in index 5014584b59..2c507ce04c 100644 --- a/dali/python/setup.py.in +++ b/dali/python/setup.py.in @@ -87,6 +87,7 @@ For more details please check the # 1.16 on python 3.12 due to import six.moves 'six >= 1.16', 'dm-tree', + 'packaging', @DALI_INSTALL_REQUIRES_NVIMGCODEC@ ], ) diff --git a/dali/test/python/autograph/pyct/test_loader.py b/dali/test/python/autograph/pyct/test_loader.py index 088e966fa3..bcd33d2c4e 100644 --- a/dali/test/python/autograph/pyct/test_loader.py +++ b/dali/test/python/autograph/pyct/test_loader.py @@ -21,7 +21,7 @@ import unittest import gast -from distutils.version import LooseVersion +from packaging.version import Version from nvidia.dali._autograph.pyct import ast_util from nvidia.dali._autograph.pyct import gast_util @@ -79,7 +79,7 @@ def test_load_ast(self): decorator_list=[], returns=None, type_comment=None, - **{"type_params": []} if gast_util.get_gast_version() >= LooseVersion("0.5.5") else {}, + **{"type_params": []} if gast_util.get_gast_version() >= Version("0.5.5") else {}, ) module, source, _ = loader.load_ast(node) diff --git a/dali/test/python/test_dali_tf_dataset_mnist.py b/dali/test/python/test_dali_tf_dataset_mnist.py index 999b7524c3..5573142195 100644 --- a/dali/test/python/test_dali_tf_dataset_mnist.py +++ b/dali/test/python/test_dali_tf_dataset_mnist.py @@ -21,7 +21,7 @@ from shutil import rmtree as remove_directory import tensorflow as tf import tensorflow.compat.v1 as tf_v1 -from distutils.version import StrictVersion +from packaging.version import Version from nose import SkipTest @@ -116,7 +116,7 @@ def run_keras_single_device(device="cpu", device_id=0): def graph_model(images, reuse, is_training): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") with tf_v1.variable_scope("mnist_net", reuse=reuse): images = tf_v1.layers.flatten(images) @@ -196,7 +196,7 @@ def _run_config(device="cpu", device_id=0): def run_estimators_single_device(device="cpu", device_id=0): - if StrictVersion(tf.__version__) < StrictVersion("2.16"): + if Version(tf.__version__) < Version("2.16"): with tf.device("/{0}:{1}".format(device, device_id)): model = keras_model() model = tf.keras.estimator.model_to_estimator( diff --git a/dali/test/python/test_dali_tf_dataset_mnist_eager.py b/dali/test/python/test_dali_tf_dataset_mnist_eager.py index d84054a62b..dcc93be60d 100644 --- a/dali/test/python/test_dali_tf_dataset_mnist_eager.py +++ b/dali/test/python/test_dali_tf_dataset_mnist_eager.py @@ -19,7 +19,7 @@ from test_utils_tensorflow import skip_for_incompatible_tf, available_gpus from nose_utils import raises from nose import SkipTest -from distutils.version import LooseVersion +from packaging.version import Version tf.compat.v1.enable_eager_execution() @@ -60,7 +60,7 @@ def test_keras_wrong_placement_cpu(): def test_keras_multi_gpu_mirrored_strategy(): # due to compatibility problems between the driver, cuda version and # TensorFlow 2.12 
test_keras_multi_gpu_mirrored_strategy doesn't work. - if LooseVersion(tf.__version__) >= LooseVersion("2.12.0"): + if Version(tf.__version__) >= Version("2.12.0"): raise SkipTest("This test is not supported for TensorFlow 2.12") strategy = tf.distribute.MirroredStrategy(devices=available_gpus()) diff --git a/dali/test/python/test_dali_tf_dataset_mnist_graph.py b/dali/test/python/test_dali_tf_dataset_mnist_graph.py index 88bbe29791..0a1aba5441 100644 --- a/dali/test/python/test_dali_tf_dataset_mnist_graph.py +++ b/dali/test/python/test_dali_tf_dataset_mnist_graph.py @@ -16,35 +16,35 @@ import tensorflow.compat.v1 as tf_v1 from nose_utils import with_setup, SkipTest, raises import test_dali_tf_dataset_mnist as mnist -from distutils.version import StrictVersion +from packaging.version import Version mnist.tf.compat.v1.disable_eager_execution() @with_setup(tf.keras.backend.clear_session) def test_keras_single_gpu(): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") mnist.run_keras_single_device("gpu", 0) @with_setup(tf.keras.backend.clear_session) def test_keras_single_other_gpu(): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") mnist.run_keras_single_device("gpu", 1) @with_setup(tf.keras.backend.clear_session) def test_keras_single_cpu(): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") mnist.run_keras_single_device("cpu", 0) @raises(tf.errors.OpError, "TF device and DALI device mismatch. TF*: CPU, DALI*: GPU for output") def test_keras_wrong_placement_gpu(): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") with tf.device("cpu:0"): model = mnist.keras_model() @@ -55,7 +55,7 @@ def test_keras_wrong_placement_gpu(): @raises(tf.errors.OpError, "TF device and DALI device mismatch. 
TF*: GPU, DALI*: CPU for output") def test_keras_wrong_placement_cpu(): - if StrictVersion(tf.__version__) >= StrictVersion("2.16"): + if Version(tf.__version__) >= Version("2.16"): raise SkipTest("TF < 2.16 is required for this test") with tf.device("gpu:0"): model = mnist.keras_model() diff --git a/dali/test/python/test_utils.py b/dali/test/python/test_utils.py index 2603c875b3..a8ce3519a8 100644 --- a/dali/test/python/test_utils.py +++ b/dali/test/python/test_utils.py @@ -26,7 +26,7 @@ import subprocess import sys import tempfile -from distutils.version import LooseVersion +from packaging.version import Version from nose_utils import SkipTest @@ -945,8 +945,8 @@ def check_numba_compatibility_cpu(if_skip=True): # Numba bug: # https://github.com/numba/numba/issues/8567 if platform.processor().lower() in ("arm64", "aarch64", "armv8") and ( - LooseVersion(numba.__version__) >= LooseVersion("0.57.0") - and LooseVersion(numba.__version__) < LooseVersion("0.59.0") + Version(numba.__version__) >= Version("0.57.0") + and Version(numba.__version__) < Version("0.59.0") ): if if_skip: raise SkipTest() diff --git a/dali_tf_plugin/build_dali_tf.sh b/dali_tf_plugin/build_dali_tf.sh index 2419b57b14..3986e2da4c 100755 --- a/dali_tf_plugin/build_dali_tf.sh +++ b/dali_tf_plugin/build_dali_tf.sh @@ -27,7 +27,7 @@ DALI_LFLAGS="-L${DALI_STUB_DIR} -ldali" TF_CFLAGS=( $($PYTHON -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) TF_LFLAGS=( $($PYTHON -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) -CPP_VER=( $($PYTHON -c "import tensorflow as tf; from distutils.version import LooseVersion; print('--std=c++14' if tf.__version__ < LooseVersion('2.10') else '--std=c++17')") ) +CPP_VER=( $($PYTHON -c "import tensorflow as tf; from packaging.version import Version; print('--std=c++14' if Version(tf.__version__) < Version('2.10') else '--std=c++17')") ) # Note: DNDEBUG flag is needed due to issue with TensorFlow custom ops: # https://github.com/tensorflow/tensorflow/issues/17316 diff --git a/dali_tf_plugin/dali_tf_plugin_install_tool.py b/dali_tf_plugin/dali_tf_plugin_install_tool.py index d33e83f423..b4a3078f2c 100644 --- a/dali_tf_plugin/dali_tf_plugin_install_tool.py +++ b/dali_tf_plugin/dali_tf_plugin_install_tool.py @@ -28,7 +28,7 @@ get_tf_build_flags, ) import os -from distutils.version import StrictVersion, LooseVersion +from packaging.version import Version from pathlib import Path import tempfile from stubgen import stubgen @@ -129,7 +129,7 @@ def __init__(self, plugin_dest_dir=None): self.can_install_prebuilt = ( not self.always_build and bool(self.tf_compiler) - and StrictVersion(self.tf_compiler) >= StrictVersion("5.0") + and Version(self.tf_compiler) >= Version("5.0") and self.is_compatible_with_prebuilt_bin and self.prebuilt_dali_stub is not None ) @@ -162,8 +162,8 @@ def __init__(self, plugin_dest_dir=None): or self.default_cpp_version == self.tf_compiler or not bool(self.tf_compiler) or ( - StrictVersion(self.default_cpp_version) >= StrictVersion("5.0") - and StrictVersion(self.tf_compiler) >= StrictVersion("5.0") + Version(self.default_cpp_version) >= Version("5.0") + and Version(self.tf_compiler) >= Version("5.0") ) ) @@ -366,7 +366,7 @@ def build(self): lib_path = os.path.join(self.plugin_dest_dir, lib_filename) # for a newer TF we need to compiler with C++17 - cpp_ver = "--std=c++14" if self.tf_version < LooseVersion("2.10") else "--std=c++17" + cpp_ver = "--std=c++14" if Version(self.tf_version) < Version("2.10") else "--std=c++17" 
# Note: DNDEBUG flag is needed due to issue with TensorFlow custom ops: # https://github.com/tensorflow/tensorflow/issues/17316 # Do not remove it. diff --git a/dali_tf_plugin/dali_tf_plugin_utils.py b/dali_tf_plugin/dali_tf_plugin_utils.py index 2266f42c5f..712f56a39d 100644 --- a/dali_tf_plugin/dali_tf_plugin_utils.py +++ b/dali_tf_plugin/dali_tf_plugin_utils.py @@ -17,7 +17,7 @@ import re import sys import fnmatch -from distutils.version import StrictVersion +from packaging.version import Version # Find file matching `pattern` in `path` @@ -66,7 +66,7 @@ def get_tf_compiler_version(): res = re.search(r"GCC:\s*\(.*\)\s*(\d+.\d+).\d+", line) if res: ver = res.group(1) - if not ret_ver or StrictVersion(ret_ver) < StrictVersion(ver): + if not ret_ver or Version(ret_ver) < Version(ver): ret_ver = ver return ret_ver diff --git a/dali_tf_plugin/setup.py.in b/dali_tf_plugin/setup.py.in index f614ea8e8a..112ef084c0 100644 --- a/dali_tf_plugin/setup.py.in +++ b/dali_tf_plugin/setup.py.in @@ -88,7 +88,8 @@ For more details please check the 'Programming Language :: Python :: 3.12', ], install_requires = [ - 'nvidia-dali@DALI_FLAVOR_MINUS@-cuda@CUDA_VERSION_SHORT_DIGIT_ONLY@==@DALI_VERSION@' + 'nvidia-dali@DALI_FLAVOR_MINUS@-cuda@CUDA_VERSION_SHORT_DIGIT_ONLY@==@DALI_VERSION@', + 'packaging', ], cmdclass={ diff --git a/docker/Dockerfile.customopbuilder.clean b/docker/Dockerfile.customopbuilder.clean index 1683a2a9eb..0b3f42eae8 100644 --- a/docker/Dockerfile.customopbuilder.clean +++ b/docker/Dockerfile.customopbuilder.clean @@ -100,6 +100,7 @@ RUN rm -f /usr/bin/python && \ rm get-pip.py; \ fi && \ pip install --upgrade pip && \ + pip install packaging && \ python --version && \ pip --version diff --git a/docs/examples/use_cases/paddle/resnet50/utils/config.py b/docs/examples/use_cases/paddle/resnet50/utils/config.py index c77ea7422c..a88aca56d5 100644 --- a/docs/examples/use_cases/paddle/resnet50/utils/config.py +++ b/docs/examples/use_cases/paddle/resnet50/utils/config.py @@ -13,16 +13,37 @@ # limitations under the License. 
import os +import sys import copy import argparse import logging -import distutils.util import dllogger from utils.mode import RunScope from utils.utility import get_num_trainers from utils.save_load import _PDOPT_SUFFIX, _PDPARAMS_SUFFIX -_AUTO_LAST_EPOCH = 'auto' +_AUTO_LAST_EPOCH = "auto" + +_true_set = {'yes', 'true', 't', 'y', '1'} +_false_set = {'no', 'false', 'f', 'n', '0'} + +# based on https://github.com/symonsoft/str2bool/tree/master +# BSD3 license +def str2bool(value, raise_exc=False): + if ( + isinstance(value, str) + or sys.version_info[0] < 3 + and isinstance(value, basestring) + ): + value = value.lower() + if value in _true_set: + return True + if value in _false_set: + return False + + if raise_exc: + raise ValueError('Expected "%s"' % '", "'.join(_true_set | _false_set)) + return None def _get_full_path_of_ckpt(args): @@ -38,16 +59,18 @@ def _check_file_exist(path_with_prefix): found = True return found, pdopt_path, pdparams_path - target_from_checkpoint = os.path.join(args.from_checkpoint, - args.model_prefix) + target_from_checkpoint = os.path.join( + args.from_checkpoint, args.model_prefix + ) if args.last_epoch_of_checkpoint is None: args.last_epoch_of_checkpoint = -1 elif args.last_epoch_of_checkpoint == _AUTO_LAST_EPOCH: folders = os.listdir(args.from_checkpoint) args.last_epoch_of_checkpoint = -1 for folder in folders: - tmp_ckpt_path = os.path.join(args.from_checkpoint, folder, - args.model_prefix) + tmp_ckpt_path = os.path.join( + args.from_checkpoint, folder, args.model_prefix + ) try: folder = int(folder) @@ -57,19 +80,27 @@ def _check_file_exist(path_with_prefix): ) continue - if folder > args.last_epoch_of_checkpoint and \ - _check_file_exist(tmp_ckpt_path)[0]: + if ( + folder > args.last_epoch_of_checkpoint + and _check_file_exist(tmp_ckpt_path)[0] + ): args.last_epoch_of_checkpoint = folder - epoch_with_prefix = os.path.join(str(args.last_epoch_of_checkpoint), args.model_prefix) \ - if args.last_epoch_of_checkpoint > -1 else args.model_prefix - target_from_checkpoint = os.path.join(args.from_checkpoint, - epoch_with_prefix) + epoch_with_prefix = ( + os.path.join(str(args.last_epoch_of_checkpoint), args.model_prefix) + if args.last_epoch_of_checkpoint > -1 + else args.model_prefix + ) + target_from_checkpoint = os.path.join( + args.from_checkpoint, epoch_with_prefix + ) else: try: args.last_epoch_of_checkpoint = int(args.last_epoch_of_checkpoint) except ValueError: - raise ValueError(f"The value of --last-epoch-of-checkpoint should be None, {_AUTO_LAST_EPOCH}" \ - f" or integer >= 0, but receive {args.last_epoch_of_checkpoint}") + raise ValueError( + f"The value of --last-epoch-of-checkpoint should be None, {_AUTO_LAST_EPOCH}" + f" or integer >= 0, but receive {args.last_epoch_of_checkpoint}" + ) args.from_checkpoint = target_from_checkpoint found, pdopt_path, pdparams_path = _check_file_exist(args.from_checkpoint) @@ -86,13 +117,15 @@ def _get_full_path_of_pretrained_params(args): args.last_epoch_of_checkpoint = -1 return - args.from_pretrained_params = os.path.join(args.from_pretrained_params, - args.model_prefix) + args.from_pretrained_params = os.path.join( + args.from_pretrained_params, args.model_prefix + ) pdparams_path = args.from_pretrained_params + _PDPARAMS_SUFFIX if not os.path.exists(pdparams_path): args.from_pretrained_params = None logging.warning( - f"Cannot find {pdparams_path}, disable --from-pretrained-params.") + f"Cannot find {pdparams_path}, disable --from-pretrained-params." 
+            )
         args.last_epoch_of_checkpoint = -1
@@ -102,7 +135,7 @@ def print_args(args):
     # Due to dllogger cannot serialize Enum into JSON.
     args_for_log.run_scope = args_for_log.run_scope.value

-    dllogger.log(step='PARAMETER', data=vars(args_for_log))
+    dllogger.log(step="PARAMETER", data=vars(args_for_log))
@@ -112,25 +145,31 @@ def check_and_process_args(args):
         if args.run_scope == scope.value:
             run_scope = scope
             break
-    assert run_scope is not None, \
-        f"only support {[scope.value for scope in RunScope]} as run_scope"
+    assert (
+        run_scope is not None
+    ), f"only support {[scope.value for scope in RunScope]} as run_scope"
     args.run_scope = run_scope

     # Precess image layout and channel
     args.image_channel = args.image_shape[0]
     if args.data_layout == "NHWC":
         args.image_shape = [
-            args.image_shape[1], args.image_shape[2], args.image_shape[0]
+            args.image_shape[1],
+            args.image_shape[2],
+            args.image_shape[0],
         ]

     # Precess learning rate
     args.lr = get_num_trainers() * args.lr

     # Precess model loading
-    assert not (args.from_checkpoint is not None and \
-        args.from_pretrained_params is not None), \
-        "--from-pretrained-params and --from-checkpoint should " \
-        "not be set simultaneously."
+    assert not (
+        args.from_checkpoint is not None
+        and args.from_pretrained_params is not None
+    ), (
+        "--from-pretrained-params and --from-checkpoint should "
+        "not be set simultaneously."
+    )
     _get_full_path_of_pretrained_params(args)
     _get_full_path_of_ckpt(args)
     args.start_epoch = args.last_epoch_of_checkpoint + 1
@@ -138,12 +177,12 @@ def check_and_process_args(args):
     # Precess benchmark
     if args.benchmark:
         assert args.run_scope in [
-            RunScope.TRAIN_ONLY, RunScope.EVAL_ONLY
+            RunScope.TRAIN_ONLY,
+            RunScope.EVAL_ONLY,
         ], "If benchmark enabled, run_scope must be `train_only` or `eval_only`"

     # Only run one epoch when benchmark or eval_only.
-    if args.benchmark or \
-        (args.run_scope == RunScope.EVAL_ONLY):
+    if args.benchmark or (args.run_scope == RunScope.EVAL_ONLY):
         args.epochs = args.start_epoch + 1

     if args.run_scope == RunScope.EVAL_ONLY:
@@ -151,366 +190,410 @@ def add_global_args(parser):
-    group = parser.add_argument_group('Global')
+    group = parser.add_argument_group("Global")
     group.add_argument(
-        '--output-dir',
+        "--output-dir",
         type=str,
-        default='./output/',
-        help='A path to store trained models.')
+        default="./output/",
+        help="A path to store trained models.",
+    )
     group.add_argument(
-        '--run-scope',
-        default='train_eval',
-        choices=('train_eval', 'train_only', 'eval_only'),
-        help='Running scope. It should be one of {train_eval, train_only, eval_only}.'
+        "--run-scope",
+        default="train_eval",
+        choices=("train_eval", "train_only", "eval_only"),
+        help="Running scope. It should be one of {train_eval, train_only, eval_only}.",
     )
     group.add_argument(
-        '--epochs',
+        "--epochs",
         type=int,
         default=90,
-        help='The number of epochs for training.')
+        help="The number of epochs for training.",
+    )
     group.add_argument(
-        '--save-interval',
+        "--save-interval",
         type=int,
         default=1,
-        help='The iteration interval to save checkpoints.')
+        help="The iteration interval to save checkpoints.",
+    )
     group.add_argument(
-        '--eval-interval',
+        "--eval-interval",
         type=int,
         default=1,
-        help='The iteration interval to test trained models on a given validation dataset. ' \
-        'Ignored when --run-scope is train_only.'
+        help="The iteration interval to test trained models on a given validation dataset. "
+        "Ignored when --run-scope is train_only.",
    )
     group.add_argument(
-        '--print-interval',
+        "--print-interval",
         type=int,
         default=10,
-        help='The iteration interval to show training/evaluation message.')
+        help="The iteration interval to show training/evaluation messages.",
+    )
     group.add_argument(
-        '--report-file',
+        "--report-file",
         type=str,
-        default='./report.json',
-        help='A file in which to store JSON experiment report.')
+        default="./report.json",
+        help="A file in which to store the JSON experiment report.",
+    )
     group.add_argument(
-        '--data-layout',
-        default='NCHW',
-        choices=('NCHW', 'NHWC'),
-        help='Data format. It should be one of {NCHW, NHWC}.')
+        "--data-layout",
+        default="NCHW",
+        choices=("NCHW", "NHWC"),
+        help="Data format. It should be one of {NCHW, NHWC}.",
+    )
     group.add_argument(
-        '--benchmark', action='store_true', help='To enable benchmark mode.')
+        "--benchmark", action="store_true", help="To enable benchmark mode."
+    )
     group.add_argument(
-        '--benchmark-steps',
+        "--benchmark-steps",
         type=int,
         default=100,
-        help='Steps for benchmark run, only be applied when --benchmark is set.'
+        help="Steps for benchmark run, only applied when --benchmark is set.",
    )
     group.add_argument(
-        '--benchmark-warmup-steps',
+        "--benchmark-warmup-steps",
         type=int,
         default=100,
-        help='Warmup steps for benchmark run, only be applied when --benchmark is set.'
+        help="Warmup steps for benchmark run, only applied when --benchmark is set.",
    )
     group.add_argument(
-        '--model-prefix',
+        "--model-prefix",
         type=str,
         default="resnet_50_paddle",
-        help='The prefix name of model files to save/load.')
+        help="The prefix name of model files to save/load.",
+    )
     group.add_argument(
-        '--from-pretrained-params',
+        "--from-pretrained-params",
         type=str,
         default=None,
-        help='A folder path which contains pretrained parameters, that is a file in name' \
-        ' --model-prefix + .pdparams. It should not be set with --from-checkpoint' \
-        ' at the same time.'
+        help="A folder path which contains pretrained parameters, that is, a file named"
+        " --model-prefix + .pdparams. It should not be set with --from-checkpoint"
+        " at the same time.",
    )
     group.add_argument(
-        '--from-checkpoint',
+        "--from-checkpoint",
         type=str,
         default=None,
-        help='A checkpoint path to resume training. It should not be set ' \
-        'with --from-pretrained-params at the same time. The path provided ' \
-        'could be a folder contains < epoch_id/ckpt_files > or < ckpt_files >.'
+        help="A checkpoint path to resume training. It should not be set "
+        "with --from-pretrained-params at the same time. The path provided "
+        "could be a folder that contains < epoch_id/ckpt_files > or < ckpt_files >.",
    )
     group.add_argument(
-        '--last-epoch-of-checkpoint',
+        "--last-epoch-of-checkpoint",
         type=str,
         default=None,
-        help='The epoch id of the checkpoint given by --from-checkpoint. ' \
-        'It should be None, auto or integer >= 0. If it is set as ' \
-        'None, then training will start from 0-th epoch. If it is set as ' \
-        'auto, then it will search largest integer-convertable folder ' \
-        ' --from-checkpoint, which contains required checkpoint. ' \
-        'Default is None.'
+        help="The epoch id of the checkpoint given by --from-checkpoint. "
+        "It should be None, auto, or an integer >= 0. If it is set to "
+        "None, training will start from the 0-th epoch. If it is set to "
+        "auto, it will search for the largest integer-convertible folder in "
+        "--from-checkpoint, which contains the required checkpoint. "
+        "Default is None.",
    )
     group.add_argument(
-        '--show-config',
-        type=distutils.util.strtobool,
+        "--show-config",
+        type=str2bool,
         default=True,
-        help='To show arguments.')
+        help="To show arguments.",
+    )
     group.add_argument(
-        '--enable-cpu-affinity',
-        type=distutils.util.strtobool,
+        "--enable-cpu-affinity",
+        type=str2bool,
         default=True,
-        help='To enable in-built GPU-CPU affinity.')
+        help="To enable in-built GPU-CPU affinity.",
+    )
     return parser


 def add_advance_args(parser):
-    group = parser.add_argument_group('Advanced Training')
+    group = parser.add_argument_group("Advanced Training")
     # AMP
     group.add_argument(
-        '--amp',
-        action='store_true',
-        help='Enable automatic mixed precision training (AMP).')
+        "--amp",
+        action="store_true",
+        help="Enable automatic mixed precision training (AMP).",
+    )
     group.add_argument(
-        '--scale-loss',
+        "--scale-loss",
         type=float,
         default=1.0,
-        help='The loss scalar for AMP training, only be applied when --amp is set.'
+        help="The loss scalar for AMP training, only applied when --amp is set.",
    )
     group.add_argument(
-        '--use-dynamic-loss-scaling',
-        action='store_true',
-        help='Enable dynamic loss scaling in AMP training, only be applied when --amp is set.'
+        "--use-dynamic-loss-scaling",
+        action="store_true",
+        help="Enable dynamic loss scaling in AMP training, only applied when --amp is set.",
    )
     group.add_argument(
-        '--use-pure-fp16',
-        action='store_true',
-        help='Enable pure FP16 training, only be applied when --amp is set.')
+        "--use-pure-fp16",
+        action="store_true",
+        help="Enable pure FP16 training, only applied when --amp is set.",
+    )
     group.add_argument(
-        '--fuse-resunit',
-        action='store_true',
-        help='Enable CUDNNv8 ResUnit fusion, only be applied when --amp is set.')
+        "--fuse-resunit",
+        action="store_true",
+        help="Enable CUDNNv8 ResUnit fusion, only applied when --amp is set.",
+    )
     # ASP
     group.add_argument(
-        '--asp',
-        action='store_true',
-        help='Enable automatic sparse training (ASP).')
+        "--asp",
+        action="store_true",
+        help="Enable automatic sparse training (ASP).",
+    )
     group.add_argument(
-        '--prune-model',
-        action='store_true',
-        help='Prune model to 2:4 sparse pattern, only be applied when --asp is set.'
+        "--prune-model",
+        action="store_true",
+        help="Prune model to 2:4 sparse pattern, only applied when --asp is set.",
    )
     group.add_argument(
-        '--mask-algo',
-        default='mask_1d',
-        choices=('mask_1d', 'mask_2d_greedy', 'mask_2d_best'),
-        help='The algorithm to generate sparse masks. It should be one of ' \
-        '{mask_1d, mask_2d_greedy, mask_2d_best}. This only be applied ' \
-        'when --asp and --prune-model is set.'
+        "--mask-algo",
+        default="mask_1d",
+        choices=("mask_1d", "mask_2d_greedy", "mask_2d_best"),
+        help="The algorithm to generate sparse masks. It should be one of "
+        "{mask_1d, mask_2d_greedy, mask_2d_best}. This is only applied "
+        "when --asp and --prune-model are set.",
    )
     return parser


 def add_dataset_args(parser):
     def float_list(x):
-        return list(map(float, x.split(',')))
+        return list(map(float, x.split(",")))

     def int_list(x):
-        return list(map(int, x.split(',')))
+        return list(map(int, x.split(",")))

-    dataset_group = parser.add_argument_group('Dataset')
+    dataset_group = parser.add_argument_group("Dataset")
     dataset_group.add_argument(
-        '--image-root',
+        "--image-root",
         type=str,
-        default='/imagenet',
-        help='A root folder of train/val images. It should contain train and val folders, ' \
-        'which store corresponding images.'
+        default="/imagenet",
+        help="A root folder of train/val images. It should contain train and val folders, "
+        "which store corresponding images.",
    )
     dataset_group.add_argument(
-        '--image-shape',
+        "--image-shape",
         type=int_list,
         default=[4, 224, 224],
-        help='The image shape. Its shape should be [channel, height, width].')
+        help="The image shape. Its shape should be [channel, height, width].",
+    )
     # Data Loader
     dataset_group.add_argument(
-        '--batch-size',
+        "--batch-size",
         type=int,
         default=256,
-        help='The batch size for both training and evaluation.')
+        help="The batch size for both training and evaluation.",
+    )
     dataset_group.add_argument(
-        '--dali-random-seed',
+        "--dali-random-seed",
         type=int,
         default=42,
-        help='The random seed for DALI data loader.')
+        help="The random seed for the DALI data loader.",
+    )
     dataset_group.add_argument(
-        '--dali-num-threads',
+        "--dali-num-threads",
         type=int,
         default=4,
-        help='The number of threads applied to DALI data loader.')
+        help="The number of threads applied to the DALI data loader.",
+    )
     dataset_group.add_argument(
-        '--dali-output-fp16',
-        action='store_true',
-        help='Output FP16 data from DALI data loader.')
+        "--dali-output-fp16",
+        action="store_true",
+        help="Output FP16 data from the DALI data loader.",
+    )

     # Augmentation
-    augmentation_group = parser.add_argument_group('Data Augmentation')
+    augmentation_group = parser.add_argument_group("Data Augmentation")
     augmentation_group.add_argument(
-        '--crop-size',
+        "--crop-size",
         type=int,
         default=224,
-        help='The size to crop input images.')
+        help="The size to crop input images.",
+    )
     augmentation_group.add_argument(
-        '--rand-crop-scale',
+        "--rand-crop-scale",
         type=float_list,
-        default=[0.08, 1.],
-        help='Range from which to choose a random area fraction.')
+        default=[0.08, 1.0],
+        help="Range from which to choose a random area fraction.",
+    )
     augmentation_group.add_argument(
-        '--rand-crop-ratio',
+        "--rand-crop-ratio",
         type=float_list,
         default=[3.0 / 4, 4.0 / 3],
-        help='Range from which to choose a random aspect ratio (width/height).')
+        help="Range from which to choose a random aspect ratio (width/height).",
+    )
     augmentation_group.add_argument(
-        '--normalize-scale',
+        "--normalize-scale",
         type=float,
         default=1.0 / 255.0,
-        help='A scalar to normalize images.')
+        help="A scalar to normalize images.",
+    )
     augmentation_group.add_argument(
-        '--normalize-mean',
+        "--normalize-mean",
         type=float_list,
         default=[0.485, 0.456, 0.406],
-        help='The mean values to normalize RGB images.')
+        help="The mean values to normalize RGB images.",
+    )
     augmentation_group.add_argument(
-        '--normalize-std',
+        "--normalize-std",
         type=float_list,
         default=[0.229, 0.224, 0.225],
-        help='The std values to normalize RGB images.')
+        help="The std values to normalize RGB images.",
+    )
     augmentation_group.add_argument(
-        '--resize-short',
+        "--resize-short",
         type=int,
         default=256,
-        help='The length of the shorter dimension of the resized image.')
+        help="The length of the shorter dimension of the resized image.",
+    )
     return parser


 def add_model_args(parser):
-    group = parser.add_argument_group('Model')
+    group = parser.add_argument_group("Model")
     group.add_argument(
-        '--model-arch-name',
+        "--model-arch-name",
         type=str,
-        default='ResNet50',
-        help='The model architecture name. It should be one of {ResNet50}.')
+        default="ResNet50",
+        help="The model architecture name. It should be one of {ResNet50}.",
+    )
     group.add_argument(
-        '--num-of-class',
+        "--num-of-class",
         type=int,
         default=1000,
-        help='The number classes of images.')
+        help="The number of classes of images.",
+    )
     group.add_argument(
-        '--bn-weight-decay',
-        action='store_true',
-        help='Apply weight decay to BatchNorm shift and scale.')
+        "--bn-weight-decay",
+        action="store_true",
+        help="Apply weight decay to BatchNorm shift and scale.",
+    )
     return parser


 def add_training_args(parser):
-    group = parser.add_argument_group('Training')
+    group = parser.add_argument_group("Training")
     group.add_argument(
-        '--label-smoothing',
+        "--label-smoothing",
         type=float,
         default=0.1,
-        help='The ratio of label smoothing.')
+        help="The ratio of label smoothing.",
+    )
     group.add_argument(
-        '--optimizer',
-        default='Momentum',
+        "--optimizer",
+        default="Momentum",
         metavar="OPTIMIZER",
-        choices=('Momentum'),
-        help='The name of optimizer. It should be one of {Momentum}.')
+        choices=("Momentum"),
+        help="The name of the optimizer. It should be one of {Momentum}.",
+    )
     group.add_argument(
-        '--momentum',
+        "--momentum",
         type=float,
         default=0.875,
-        help='The momentum value of optimizer.')
+        help="The momentum value of the optimizer.",
+    )
     group.add_argument(
-        '--weight-decay',
+        "--weight-decay",
         type=float,
         default=3.0517578125e-05,
-        help='The coefficient of weight decay.')
+        help="The coefficient of weight decay.",
+    )
     group.add_argument(
-        '--lr-scheduler',
-        default='Cosine',
+        "--lr-scheduler",
+        default="Cosine",
         metavar="LR_SCHEDULER",
-        choices=('Cosine'),
-        help='The name of learning rate scheduler. It should be one of {Cosine}.'
+        choices=("Cosine"),
+        help="The name of the learning rate scheduler. It should be one of {Cosine}.",
    )
     group.add_argument(
-        '--lr', type=float, default=0.256, help='The initial learning rate.')
+        "--lr", type=float, default=0.256, help="The initial learning rate."
+    )
     group.add_argument(
-        '--warmup-epochs',
+        "--warmup-epochs",
         type=int,
         default=5,
-        help='The number of epochs for learning rate warmup.')
+        help="The number of epochs for learning rate warmup.",
+    )
     group.add_argument(
-        '--warmup-start-lr',
+        "--warmup-start-lr",
         type=float,
         default=0.0,
-        help='The initial learning rate for warmup.')
+        help="The initial learning rate for warmup.",
+    )
     return parser


 def add_trt_args(parser):
-    group = parser.add_argument_group('Paddle-TRT')
+    group = parser.add_argument_group("Paddle-TRT")
     group.add_argument(
-        '--device',
+        "--device",
         type=int,
-        default='0',
-        help='The GPU device id for Paddle-TRT inference.'
+        default="0",
+        help="The GPU device id for Paddle-TRT inference.",
    )
     group.add_argument(
-        '--trt-inference-dir',
+        "--trt-inference-dir",
         type=str,
-        default='./inference',
-        help='A path to store/load inference models. ' \
-        'export_model.py would export models to this folder, ' \
-        'then inference.py would load from here.'
+        default="./inference",
+        help="A path to store/load inference models. "
+        "export_model.py would export models to this folder, "
+        "then inference.py would load from here.",
    )
     group.add_argument(
-        '--trt-precision',
-        default='FP32',
-        choices=('FP32', 'FP16', 'INT8'),
-        help='The precision of TensorRT. It should be one of {FP32, FP16, INT8}.'
+        "--trt-precision",
+        default="FP32",
+        choices=("FP32", "FP16", "INT8"),
+        help="The precision of TensorRT. It should be one of {FP32, FP16, INT8}.",
    )
     group.add_argument(
-        '--trt-workspace-size',
+        "--trt-workspace-size",
         type=int,
         default=(1 << 30),
-        help='The memory workspace of TensorRT in MB.')
+        help="The memory workspace of TensorRT in MB.",
+    )
     group.add_argument(
-        '--trt-min-subgraph-size',
+        "--trt-min-subgraph-size",
         type=int,
         default=3,
-        help='The minimal subgraph size to enable PaddleTRT.')
+        help="The minimal subgraph size to enable PaddleTRT.",
+    )
     group.add_argument(
-        '--trt-use-static',
-        type=distutils.util.strtobool,
+        "--trt-use-static",
+        type=str2bool,
         default=False,
-        help='Fix TensorRT engine at first running.')
+        help="Fix the TensorRT engine at first run.",
+    )
     group.add_argument(
-        '--trt-use-calib-mode',
-        type=distutils.util.strtobool,
+        "--trt-use-calib-mode",
+        type=str2bool,
         default=False,
-        help='Use the PTQ calibration of PaddleTRT int8.')
+        help="Use the PTQ calibration of PaddleTRT int8.",
+    )
     group.add_argument(
-        '--trt-export-log-path',
+        "--trt-export-log-path",
         type=str,
-        default='./export.json',
-        help='A file in which to store JSON model exporting report.')
+        default="./export.json",
+        help="A file in which to store the JSON model exporting report.",
+    )
     group.add_argument(
-        '--trt-log-path',
+        "--trt-log-path",
         type=str,
-        default='./inference.json',
-        help='A file in which to store JSON inference report.')
+        default="./inference.json",
+        help="A file in which to store the JSON inference report.",
+    )
     group.add_argument(
-        '--trt-use-synthetic',
-        type=distutils.util.strtobool,
+        "--trt-use-synthetic",
+        type=str2bool,
         default=False,
-        help='Apply synthetic data for benchmark.')
+        help="Apply synthetic data for benchmark.",
+    )
     return parser


 def parse_args(including_trt=False):
     parser = argparse.ArgumentParser(
         description="PaddlePaddle RN50v1.5 training script",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
     parser = add_global_args(parser)
     parser = add_dataset_args(parser)
     parser = add_model_args(parser)
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/common.py b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/common.py
index 55eb620abd..a36c753e2e 100644
--- a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/common.py
+++ b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/common.py
@@ -1,5 +1,5 @@
 import tensorflow as tf
-from distutils.version import StrictVersion
+from packaging.version import Version

 BASE_LEARNING_RATE = 0.1
@@ -18,7 +18,7 @@ def create_piecewise_constant_decay_with_warmup(batch_size, epoch_size,
     rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size
     step_boundaries = [float(steps_per_epoch) * x for x in boundaries]
     lr_values = [rescaled_lr * m for m in multipliers]
-    if StrictVersion(tf.__version__) >= StrictVersion("2.13"):
+    if Version(tf.__version__) >= Version("2.13"):
         warmup_steps = int(warmup_epochs * steps_per_epoch)
     else:
         warmup_steps = warmup_epochs * steps_per_epoch
@@ -38,7 +38,7 @@ def __init__(self, rescaled_lr, step_boundaries, lr_values, warmup_steps,
         super(PiecewiseConstantDecayWithWarmup, self).__init__()
         self.rescaled_lr = rescaled_lr
-        if StrictVersion(tf.__version__) >= StrictVersion("2.13"):
+        if Version(tf.__version__) >= Version("2.13"):
             self.step_boundaries = [int(b) for b in step_boundaries]
         else:
             self.step_boundaries = step_boundaries
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/hvd_patch.py b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/hvd_patch.py
index 4dca4f6577..5f328eff4d 100644
--- a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/hvd_patch.py
+++ b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/hvd_patch.py
@@ -8,10 +8,10 @@
 import horovod.tensorflow as hvd
 import tensorflow as tf
 from nvutils import common
-from distutils.version import LooseVersion
+from packaging.version import Version
 from horovod.tensorflow import Average, Compression, Sum

-_PRE_TF_2_4_0 = LooseVersion(tf.__version__) < LooseVersion('2.4.0')
+_PRE_TF_2_4_0 = Version(tf.__version__) < Version('2.4.0')

 def create_distributed_optimizer(
     keras, optimizer, name, device_dense, device_sparse, compression,
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner.py b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner.py
index a53034dbe0..91af31bd94 100755
--- a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner.py
+++ b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner.py
@@ -16,7 +16,7 @@
 from nvutils import image_processing
 from nvutils import common

-from distutils.version import StrictVersion
+from packaging.version import Version
 import tensorflow as tf
 from tensorflow import keras
@@ -27,8 +27,8 @@
 from keras import backend

 print(tf.__version__)
-if StrictVersion(tf.__version__) > StrictVersion("2.1.0"):
-  if StrictVersion(tf.__version__) >= StrictVersion("2.4.0"):
+if Version(tf.__version__) > Version("2.1.0"):
+  if Version(tf.__version__) >= Version("2.4.0"):
     from tensorflow.python.keras.mixed_precision import device_compatibility_check
   else:
     from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check
@@ -142,7 +142,7 @@ def train(model_func, params):
     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

   if precision == 'fp16':
-    if StrictVersion(tf.__version__) >= StrictVersion("2.4.0"):
+    if Version(tf.__version__) >= Version("2.4.0"):
       policy = keras.mixed_precision.Policy('mixed_float16')
       keras.mixed_precision.set_global_policy(policy)
     else:
@@ -160,7 +160,7 @@ def train(model_func, params):
   # Horovod: add Horovod DistributedOptimizer. We use a modified version to
   # support the custom learning rate schedule.
   opt = hvd.DistributedOptimizer(opt)
-  if StrictVersion(tf.__version__) >= StrictVersion("2.4.0") and precision == 'fp16':
+  if Version(tf.__version__) >= Version("2.4.0") and precision == 'fp16':
     opt = keras.mixed_precision.LossScaleOptimizer(opt, dynamic=False,
                                                    initial_scale=loss_scale)
diff --git a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner_ctl.py b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner_ctl.py
index bcdfa22bf4..cf4f9266d2 100755
--- a/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner_ctl.py
+++ b/docs/examples/use_cases/tensorflow/resnet-n/nvutils/runner_ctl.py
@@ -17,7 +17,7 @@
 from builtins import range
 from nvutils import image_processing
 from nvutils import common
-from distutils.version import StrictVersion
+from packaging.version import Version
 import tensorflow as tf
 from tensorflow import keras
@@ -27,8 +27,8 @@
 from keras import backend

 print(tf.__version__)
-if StrictVersion(tf.__version__) > StrictVersion("2.1.0"):
-  if StrictVersion(tf.__version__) >= StrictVersion("2.4.0"):
+if Version(tf.__version__) > Version("2.1.0"):
+  if Version(tf.__version__) >= Version("2.4.0"):
     from tensorflow.python.keras.mixed_precision import device_compatibility_check
   else:
     from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check
@@ -105,7 +105,7 @@ def train_ctl(model_func, params):
   summary_writer = None

   if precision == 'fp16':
-    if StrictVersion(tf.__version__) >= StrictVersion("2.4.0"):
+    if Version(tf.__version__) >= Version("2.4.0"):
       policy = keras.mixed_precision.Policy('mixed_float16')
       keras.mixed_precision.set_global_policy(policy)
     else:
diff --git a/qa/TL1_tensorflow_dataset/test_impl.sh b/qa/TL1_tensorflow_dataset/test_impl.sh
index d3bec26540..aae4b015fa 100755
--- a/qa/TL1_tensorflow_dataset/test_impl.sh
+++ b/qa/TL1_tensorflow_dataset/test_impl.sh
@@ -25,8 +25,8 @@ test_body() {
     pushd ../../../docs/examples/frameworks/tensorflow/
     # TF 2.16 removed usage of tf.estimator the test uses
     is_below_2_16=$(python -c 'import tensorflow as tf; \
-        from distutils.version import StrictVersion; \
-        print(StrictVersion(tf.__version__) < StrictVersion("2.16"))')
+        from packaging.version import Version; \
+        print(Version(tf.__version__) < Version("2.16"))')

     if [ $is_below_2_16 = 'True' ]; then
         jupyter nbconvert tensorflow-dataset.ipynb \
@@ -39,9 +39,9 @@ test_body() {
     # TensorFlow 2.12 test_keras_multi_gpu_mirrored_strategy doesn't work.
     is_compatible_distributed=$(python -c 'import nvidia.dali.plugin.tf as dali_tf; \
         import tensorflow as tf; \
-        from distutils.version import LooseVersion; \
+        from packaging.version import Version; \
         print(dali_tf.dataset_distributed_compatible_tensorflow() \
-              and LooseVersion(tf.__version__) < LooseVersion("2.12.0"))')
+              and Version(tf.__version__) < Version("2.12.0"))')
     if [ $is_compatible_distributed = 'True' ]; then
         jupyter nbconvert tensorflow-dataset-multigpu.ipynb \
             --to notebook --inplace --execute \

From a093d74146c1f1bacef47e58501348acede2ff46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?=
Date: Mon, 28 Oct 2024 11:04:05 +0100
Subject: [PATCH 28/29] Add dynamic executor support to TF plugin. (#5686)

* Add dynamic executor support to TF plugin.
* Add tests that wouldn't work with legacy executor

---------

Signed-off-by: Michal Zientkiewicz
---
 dali/python/nvidia/dali/plugin/tf.py   |  19 +++-
 dali/test/python/test_dali_tf_exec2.py | 133 +++++++++++++++++++++++++
 dali_tf_plugin/dali_dataset.h          |   4 +-
 dali_tf_plugin/dali_dataset_op.cc      |  50 ++++++----
 dali_tf_plugin/daliop.cc               |  63 +++++++-----
 qa/TL0_tensorflow_plugin/test.sh       |   3 +
 6 files changed, 224 insertions(+), 48 deletions(-)
 create mode 100644 dali/test/python/test_dali_tf_exec2.py

diff --git a/dali/python/nvidia/dali/plugin/tf.py b/dali/python/nvidia/dali/plugin/tf.py
index 7ae887e79d..28260bf329 100644
--- a/dali/python/nvidia/dali/plugin/tf.py
+++ b/dali/python/nvidia/dali/plugin/tf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -214,6 +214,7 @@ def DALIIteratorWrapper(
     dtypes=[],
     batch_size=-1,
     prefetch_queue_depth=2,
+    exec_dynamic=False,
     **kwargs,
 ):
     """
@@ -232,6 +233,9 @@ def DALIIteratorWrapper(
         cpu_prefetch_queue_depth = -1  # dummy: wont' be used
         gpu_prefetch_queue_depth = prefetch_queue_depth

+    if pipeline is not None and pipeline._exec_dynamic:
+        exec_dynamic = True
+
     if serialized_pipeline is None:
         serialized_pipeline = serialize_pipeline(pipeline)
@@ -281,6 +285,7 @@ def DALIIteratorWrapper(
         exec_separated=exec_separated,
         gpu_prefetch_queue_depth=gpu_prefetch_queue_depth,
         cpu_prefetch_queue_depth=cpu_prefetch_queue_depth,
+        exec_dynamic=exec_dynamic,
         **kwargs,
     )
     new_out = []
@@ -436,6 +441,7 @@ def __init__(
         num_threads=4,
         device_id=0,
         exec_separated=False,
+        exec_dynamic=False,
         prefetch_queue_depth=2,
         cpu_prefetch_queue_depth=2,
         gpu_prefetch_queue_depth=2,
@@ -445,6 +451,9 @@ def __init__(
         output_shapes = self._handle_deprecation(output_shapes, shapes, "shapes")
         output_dtypes = self._handle_deprecation(output_dtypes, dtypes, "dtypes")

+        if pipeline._exec_dynamic:
+            exec_dynamic = True
+
         if not self._check_dtypes(output_dtypes, tf.DType):
             raise TypeError(
                 "`output_dtypes` should be provided as single tf.DType value "
@@ -475,6 +484,7 @@ def __init__(
             device_id = types.CPU_ONLY_DEVICE_ID
         self._device_id = device_id
         self._exec_separated = exec_separated
+        self._exec_dynamic = exec_dynamic
         self._prefetch_queue_depth = prefetch_queue_depth
         self._cpu_prefetch_queue_depth = cpu_prefetch_queue_depth
         self._gpu_prefetch_queue_depth = gpu_prefetch_queue_depth
@@ -805,6 +815,7 @@ def _as_variant_tensor(self):
             num_threads=self._num_threads,
             device_id=self._device_id,
             exec_separated=self._exec_separated,
+            exec_dynamic=self._exec_dynamic,
             prefetch_queue_depth=self._prefetch_queue_depth,
             cpu_prefetch_queue_depth=self._cpu_prefetch_queue_depth,
             gpu_prefetch_queue_depth=self._gpu_prefetch_queue_depth,
@@ -865,6 +876,7 @@ def __init__(
         num_threads=4,
         device_id=0,
         exec_separated=False,
+        exec_dynamic=False,
         prefetch_queue_depth=2,
         cpu_prefetch_queue_depth=2,
         gpu_prefetch_queue_depth=2,
@@ -984,6 +996,11 @@ def __init__(self, *args, **kwargs):
         Whether to execute the pipeline in a way that enables
         overlapping CPU and GPU computation, typically resulting
         in faster execution speed, but larger memory consumption.
+        This flag is incompatible with ``exec_dynamic``.
+    exec_dynamic : bool, optional, default = False
+        Whether to execute the pipeline with the dynamic executor, which allows flexible mixing
+        of CPU and GPU operators and enables aggressive memory reuse.
+        This flag is incompatible with ``exec_separated``.
     prefetch_queue_depth : int, optional, default = 2
         depth of the executor queue. Deeper queue makes DALI more
         resistant to uneven execution time of each batch, but it also
diff --git a/dali/test/python/test_dali_tf_exec2.py b/dali/test/python/test_dali_tf_exec2.py
new file mode 100644
index 0000000000..ec83babdc3
--- /dev/null
+++ b/dali/test/python/test_dali_tf_exec2.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+import numpy as np
+import os.path
+from nvidia.dali import pipeline_def
+import nvidia.dali.fn as fn
+import nvidia.dali.types as types
+import nvidia.dali.plugin.tf as dali_tf
+from nose_utils import with_setup
+from test_utils_tensorflow import skip_inputs_for_incompatible_tf
+from test_utils import get_dali_extra_path
+
+
+test_data_root = get_dali_extra_path()
+lmdb_folder = os.path.join(test_data_root, "db", "lmdb")
+
+
+@pipeline_def(
+    enable_conditionals=True,
+    batch_size=5,
+    num_threads=4,
+    device_id=0,
+    experimental_exec_dynamic=True,
+)
+def dali_exec2_pipeline():
+    iter_id = fn.external_source(source=lambda x: np.array(x.iteration), batch=False)
+    if iter_id & 1 == 0:
+        output = types.Constant(np.array(-1), device="gpu")
+    else:
+        output = types.Constant(np.array(1), device="gpu")
+    return output.cpu()
+
+
+@with_setup(skip_inputs_for_incompatible_tf)
+def test_tf_dataset_exec2():
+    """Test that exec_dynamic is propagated to DALI pipeline from dali_tf.DALIDatasetWithInputs"""
+    # From Tensorflow's perspective, this is a CPU pipeline
+    with tf.device("/cpu:0"):
+        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
+            pipeline=dali_exec2_pipeline(),
+            batch_size=5,
+            output_shapes=(5,),
+            output_dtypes=(tf.int32),
+            num_threads=4,
+            device_id=0,
+        )
+
+    @tf.function
+    def tf_function_with_conditionals(dali_dataset):
+        negative = tf.constant(0)
+        positive = tf.constant(0)
+        for input in dali_dataset:
+            if tf.reduce_sum(input) < 0:
+                negative = negative + 1
+            else:
+                positive = positive + 1
+        return negative, positive
+
+    pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
+    assert pos == 3
+    assert neg == 2
+
+
+@pipeline_def(num_threads=4, experimental_exec_dynamic=True)
+def daliop_pipe():
+    jpegs, labels = fn.readers.caffe(path=lmdb_folder, random_shuffle=False)
+    imgs = fn.decoders.image(jpegs, device="mixed")
+    imgs = fn.resize(imgs, size=(100, 100))
+    shape = imgs.shape(dtype=types.UINT32)
+    return imgs.cpu(), shape
+
+
+def get_batch_dali(batch_size):
+    pipe = daliop_pipe(batch_size=batch_size, num_threads=4, device_id=0)
+    pipe.build()
+
+    daliop = dali_tf.DALIIterator()
+    images = []
+    labels = []
+    with tf.device("/cpu:0"):
+        image, label = daliop(
+            pipeline=pipe,
+            shapes=[
+                (batch_size, 100, 100, 3),
+                (
+                    batch_size,
+                    3,
+                ),
+            ],
+            dtypes=[tf.uint8, tf.int32],
+            device_id=0,
+        )
+        images.append(image)
+        labels.append(label)
+
+    return [images, labels]
+
+
+def test_tf_op():
+    """Test that exec_dynamic is propagated to DALI pipeline from dali_tf.DALIIterator"""
+    try:
+        tf.compat.v1.disable_eager_execution()
+    except ModuleNotFoundError:
+        pass
+
+    batch_size = 8
+    iterations = 2
+    test_batch = get_batch_dali(batch_size)
+    try:
+        from tensorflow.compat.v1 import Session
+    except ImportError:
+        # Older TF versions don't have compat.v1 layer
+        from tensorflow import Session
+
+    with Session() as sess:
+        for i in range(iterations):
+            imgs, shapes = sess.run(test_batch)
+            for img, shape in zip(imgs, shapes):
+                for i in range(batch_size):
+                    assert tuple(img[i].shape) == tuple(shape[i])
diff --git a/dali_tf_plugin/dali_dataset.h b/dali_tf_plugin/dali_dataset.h
index f4996f175a..ef0b4d7b1b 100644
--- a/dali_tf_plugin/dali_dataset.h
+++ b/dali_tf_plugin/dali_dataset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -70,6 +70,7 @@ class DALIDatasetOp : public tensorflow::data::DatasetOpKernel {
     int num_threads;
     int device_id;
     bool exec_separated;
+    bool exec_dynamic;
    int prefetch_queue_depth;
     int cpu_prefetch_queue_depth;
     int gpu_prefetch_queue_depth;
@@ -99,6 +100,7 @@ class DALIDatasetOp : public tensorflow::data::DatasetOpKernel {
   static constexpr const char* const kNumThreads = "num_threads";
   static constexpr const char* const kDeviceId = "device_id";
   static constexpr const char* const kExecSeparated = "exec_separated";
+  static constexpr const char* const kExecDynamic = "exec_dynamic";
   static constexpr const char* const kPrefetchQueueDepth = "prefetch_queue_depth";
   static constexpr const char* const kCpuPrefetchQueueDepth = "cpu_prefetch_queue_depth";
   static constexpr const char* const kGpuPrefetchQueueDepth = "gpu_prefetch_queue_depth";
diff --git a/dali_tf_plugin/dali_dataset_op.cc b/dali_tf_plugin/dali_dataset_op.cc
index 0c95a0dd2f..dbefe62ae6 100644
--- a/dali_tf_plugin/dali_dataset_op.cc
+++ b/dali_tf_plugin/dali_dataset_op.cc
@@ -220,6 +220,7 @@ class DALIDatasetOp::Dataset : public DatasetBase {
       SerializeField(attrs, b, kNumThreads, pipeline_def_.num_threads);
       SerializeField(attrs, b, kDeviceId, pipeline_def_.device_id);
       SerializeField(attrs, b, kExecSeparated, pipeline_def_.exec_separated);
+      SerializeField(attrs, b, kExecDynamic, pipeline_def_.exec_dynamic);
      SerializeField(attrs, b, kPrefetchQueueDepth, pipeline_def_.prefetch_queue_depth);
       SerializeField(attrs, b, kCpuPrefetchQueueDepth, pipeline_def_.cpu_prefetch_queue_depth);
       SerializeField(attrs, b, kGpuPrefetchQueueDepth, pipeline_def_.gpu_prefetch_queue_depth);
@@ -248,10 +249,15 @@ class DALIDatasetOp::Dataset : public DatasetBase {
   }

   Status InitPipeline(daliPipelineHandle *pipeline_handle) const {
-    TF_DALI_CALL(daliCreatePipeline(
+    dali_exec_flags_t flags = DALI_EXEC_ASYNC_PIPELINED;
+    if (pipeline_def_.exec_dynamic)
+      flags = flags | DALI_EXEC_IS_DYNAMIC;
+    if (pipeline_def_.exec_separated)
+      flags = flags | DALI_EXEC_IS_SEPARATED;
+    TF_DALI_CALL(daliCreatePipeline3(
         pipeline_handle, pipeline_def_.pipeline.c_str(), pipeline_def_.pipeline.length(),
         pipeline_def_.batch_size, pipeline_def_.num_threads, pipeline_def_.device_id,
-        pipeline_def_.exec_separated, pipeline_def_.prefetch_queue_depth,
+        flags, pipeline_def_.prefetch_queue_depth,
         pipeline_def_.cpu_prefetch_queue_depth, pipeline_def_.gpu_prefetch_queue_depth,
         pipeline_def_.enable_memory_stats));
     return Status();
@@ -380,26 +386,28 @@ class DALIDatasetOp::Dataset::Iterator : public DatasetIterator {
     }

     ~Iterator() {
-      if (enable_memory_stats_) {
-        size_t N;
-        daliExecutorMetadata *meta;
-        daliGetExecutorMetadata(&pipeline_handle_, &meta, &N);
-        std::cout << "DALI operator memory statistics: " << std::endl;
-        for (size_t i = 0; i < N; ++i) {
-          std::cout << "Operator " << meta[i].operator_name;
-          for (size_t j = 0; j < meta[i].out_num; ++j) {
-            std::cout << " output [ " << j << " ] : " << meta[i].real_size[j] << "B allocated "
-                      << meta[i].max_real_size[j] << "B max allocated " << meta[i].reserved[j]
-                      << "B reserved" << meta[i].max_reserved[j] << "B max reserved";
-            if (j != meta[i].out_num - 1) {
-              std::cout << ",";
+      if (pipeline_handle_) {
+        if (enable_memory_stats_) {
+          size_t N;
+          daliExecutorMetadata *meta;
+          daliGetExecutorMetadata(&pipeline_handle_, &meta, &N);
+          std::cout << "DALI operator memory statistics: " << std::endl;
+          for (size_t i = 0; i < N; ++i) {
+            std::cout << "Operator " << meta[i].operator_name;
+            for (size_t j = 0; j < meta[i].out_num; ++j) {
+              std::cout << " output [ " << j << " ] : " << meta[i].real_size[j] << "B allocated "
+                        << meta[i].max_real_size[j] << "B max allocated " << meta[i].reserved[j]
+                        << "B reserved" << meta[i].max_reserved[j] << "B max reserved";
+              if (j != meta[i].out_num - 1) {
+                std::cout << ",";
+              }
             }
+            std::cout << std::endl;
           }
-          std::cout << std::endl;
+          daliFreeExecutorMetadata(meta, N);
         }
-        daliFreeExecutorMetadata(meta, N);
+        daliDeletePipeline(&pipeline_handle_);
       }
-      daliDeletePipeline(&pipeline_handle_);
     }

 #if TF_MAJOR_VERSION > 2 || (TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 3)
@@ -941,8 +949,8 @@ class DALIDatasetOp::Dataset::Iterator : public DatasetIterator {
     std::vector input_ext_src_devices_;
     std::queue alive_batches_;
     InputState iterator_state_ = InputState::in_progress;
-    daliPipelineHandle pipeline_handle_;
-    bool enable_memory_stats_;
+    daliPipelineHandle pipeline_handle_ = nullptr;
+    bool enable_memory_stats_ = false;
   };

 void DALIDatasetOp::MakeDataset(OpKernelContext *context, DatasetBase **output) {
@@ -959,6 +967,7 @@ void DALIDatasetOp::FillPipelineDef(OpKernelConstruction *context, PipelineDef &
   OP_REQUIRES_OK(context, context->GetAttr(kNumThreads, &def.num_threads));
   OP_REQUIRES_OK(context, context->GetAttr(kDeviceId, &def.device_id));
   OP_REQUIRES_OK(context, context->GetAttr(kExecSeparated, &def.exec_separated));
+  OP_REQUIRES_OK(context, context->GetAttr(kExecDynamic, &def.exec_dynamic));
   OP_REQUIRES_OK(context, context->GetAttr(kPrefetchQueueDepth, &def.prefetch_queue_depth));
   OP_REQUIRES_OK(context, context->GetAttr(kCpuPrefetchQueueDepth, &def.cpu_prefetch_queue_depth));
   OP_REQUIRES_OK(context, context->GetAttr(kGpuPrefetchQueueDepth, &def.gpu_prefetch_queue_depth));
@@ -1079,6 +1088,7 @@ REGISTER_OP("DALIDataset")
     .Attr("num_threads: int")
     .Attr("device_id: int")
     .Attr("exec_separated: bool")
+    .Attr("exec_dynamic: bool")
     .Attr("prefetch_queue_depth: int")
     .Attr("cpu_prefetch_queue_depth: int")
     .Attr("gpu_prefetch_queue_depth: int")
diff --git a/dali_tf_plugin/daliop.cc b/dali_tf_plugin/daliop.cc
index 7d9e1a9ed5..5123ab1355 100644
--- a/dali_tf_plugin/daliop.cc
+++ b/dali_tf_plugin/daliop.cc
@@ -67,6 +67,7 @@ REGISTER_OP("Dali")
   .Attr("num_threads: int = -1")
   .Attr("device_id: int = -1")
   .Attr("exec_separated: bool = false")
+  .Attr("exec_dynamic: bool = false")
   .Attr("gpu_prefetch_queue_depth: int = 2")
   .Attr("cpu_prefetch_queue_depth: int = 2")
   .Attr("sparse: list(bool) = []")
@@ -111,6 +112,7 @@ class DaliOp : public tf::OpKernel {
    int device_id;
    int max_batch_size;
    bool exec_separated;
+    bool exec_dynamic;
    int cpu_prefetch_queue_depth;

    OP_REQUIRES_OK(context, context->GetAttr("shapes", &shapes_));
@@ -118,6 +120,7 @@ class DaliOp : public tf::OpKernel {
    OP_REQUIRES_OK(context, context->GetAttr("num_threads", &num_threads));
    OP_REQUIRES_OK(context, context->GetAttr("device_id", &device_id));
    OP_REQUIRES_OK(context, context->GetAttr("exec_separated", &exec_separated));
+    OP_REQUIRES_OK(context, context->GetAttr("exec_dynamic", &exec_dynamic));
    // In exec_separated==false case, gpu_prefetch_queue_depth is the global prefetch_queue_depth_
    OP_REQUIRES_OK(context, context->GetAttr("gpu_prefetch_queue_depth", &prefetch_queue_depth_));
    OP_REQUIRES_OK(context, context->GetAttr("sparse", &sparse_));
@@ -142,13 +145,19 @@ class DaliOp : public tf::OpKernel {
      max_batch_size = shapes_[0].dim_size(0);
    }

-    TF_DALI_CALL(daliCreatePipeline(&pipe_handle_,
+    dali_exec_flags_t flags = DALI_EXEC_ASYNC_PIPELINED;
+    if (exec_dynamic)
+      flags = flags | DALI_EXEC_IS_DYNAMIC;
+    if (exec_separated)
+      flags = flags | DALI_EXEC_IS_SEPARATED;
+
+    TF_DALI_CALL(daliCreatePipeline3(&pipe_handle_,
                  serialized_pipeline.c_str(),
                  serialized_pipeline.length(),
                  max_batch_size,
                  num_threads,
                  device_id,
-                 exec_separated,
+                 flags,
                  prefetch_queue_depth_,
                  cpu_prefetch_queue_depth,
                  prefetch_queue_depth_,
@@ -165,28 +174,30 @@ class DaliOp : public tf::OpKernel {
   }

   ~DaliOp() override {
-    if (enable_memory_stats_) {
-      size_t N;
-      daliExecutorMetadata *meta;
-      daliGetExecutorMetadata(&pipe_handle_, &meta, &N);
-      std::cout << "DALI operator memory statistics: " << std::endl;
-      for (size_t i = 0; i < N; ++i) {
-        std::cout << "Operator " << meta[i].operator_name;
-        for (size_t j = 0; j < meta[i].out_num; ++j) {
-          std::cout << " output [ " << j << " ] : "
-                    << meta[i].real_size[j] << "B allocated "
-                    << meta[i].max_real_size[j] << "B max allocated "
-                    << meta[i].reserved[j] << "B reserved"
-                    << meta[i].max_reserved[j] << "B max reserved";
-          if (j != meta[i].out_num - 1) {
-            std::cout << ",";
+    if (pipe_handle_) {
+      if (enable_memory_stats_) {
+        size_t N;
+        daliExecutorMetadata *meta;
+        daliGetExecutorMetadata(&pipe_handle_, &meta, &N);
+        std::cout << "DALI operator memory statistics: " << std::endl;
+        for (size_t i = 0; i < N; ++i) {
+          std::cout << "Operator " << meta[i].operator_name;
+          for (size_t j = 0; j < meta[i].out_num; ++j) {
+            std::cout << " output [ " << j << " ] : "
+                      << meta[i].real_size[j] << "B allocated "
+                      << meta[i].max_real_size[j] << "B max allocated "
+                      << meta[i].reserved[j] << "B reserved"
+                      << meta[i].max_reserved[j] << "B max reserved";
+            if (j != meta[i].out_num - 1) {
+              std::cout << ",";
+            }
           }
+          std::cout << std::endl;
         }
-        std::cout << std::endl;
+        daliFreeExecutorMetadata(meta, N);
       }
-      daliFreeExecutorMetadata(meta, N);
+      daliDeletePipeline(&pipe_handle_);
     }
-    daliDeletePipeline(&pipe_handle_);
   }

   void Compute(tf::OpKernelContext* context) override {
@@ -389,15 +400,15 @@ class DaliOp : public tf::OpKernel {
   }

  private:
-  daliPipelineHandle pipe_handle_;
+  daliPipelineHandle pipe_handle_ = nullptr;
   std::vector shapes_;
   tf::DataTypeVector types_;
-  int device_id_;
-  int batch_size_;
-  int prefetch_queue_depth_;
-  device_type_t device_type_;
+  int device_id_ = -1;
+  int batch_size_ = 0;
+  int prefetch_queue_depth_ = -1;
+  device_type_t device_type_ = CPU;
   std::vector sparse_;
-  bool enable_memory_stats_;
+  bool enable_memory_stats_ = false;
 };

 using tf::int64;
diff --git a/qa/TL0_tensorflow_plugin/test.sh b/qa/TL0_tensorflow_plugin/test.sh
index 60092468d7..0f4ef097a4 100755
--- a/qa/TL0_tensorflow_plugin/test.sh
+++ b/qa/TL0_tensorflow_plugin/test.sh
@@ -45,6 +45,9 @@ test_body() {
         ${python_invoke_test} test_dali_tf_dataset_eager.py
         ${python_invoke_test} test_dali_tf_dataset_graph.py
     fi
+
+    # DALI TF + dynamic executor
+    ${python_invoke_test} test_dali_tf_exec2.py
 }

 pushd ../..

From bf7a0a5a73de10f412c8a28a909da3f0438574b0 Mon Sep 17 00:00:00 2001
From: Kamil Tokarski
Date: Mon, 28 Oct 2024 12:41:08 +0100
Subject: [PATCH 29/29] Update DALI_DEPS_VERSION for new OpenSSL (#5689)

Signed-off-by: Kamil Tokarski
---
 DALI_DEPS_VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DALI_DEPS_VERSION b/DALI_DEPS_VERSION
index 4ec8cae34a..6d7aac8cb7 100644
--- a/DALI_DEPS_VERSION
+++ b/DALI_DEPS_VERSION
@@ -1 +1 @@
-c7e3e7b996b0a1b19f5e435d32e64c20d9a28a42
+a72649c13fc6282960976760e4b88b1d315d3528
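
Note on the argparse patch above: it swaps `distutils.util.strtobool` (removed together with `distutils` in Python 3.12) for a project-local `str2bool` helper whose definition is not part of the visible hunks. A minimal sketch of such a helper follows; the name matches the patch, but the body is an assumption, not the patched file's actual code:

import argparse

def str2bool(v):
    # Pass booleans through so `default=True` still works with `type=str2bool`.
    if isinstance(v, bool):
        return v
    # Mirror the truthy/falsy spellings accepted by distutils.util.strtobool.
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"invalid truth value {v!r}")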
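Note on the nvutils and qa-script patches above: `StrictVersion` and `LooseVersion` come from the removed `distutils` module, and `StrictVersion` rejects PEP 440 version strings such as pre-releases, while `packaging.version.Version` parses and orders them correctly. A small illustration, not part of the patch:

from packaging.version import Version

# Pre-releases order correctly under PEP 440; StrictVersion("2.16.0rc0")
# would raise ValueError because it only understands "aN"/"bN" suffixes.
assert Version("2.16.0rc0") < Version("2.16")
assert Version("2.4.0") >= Version("2.4")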
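Note on patch 28: once `exec_dynamic` reaches the C API as `DALI_EXEC_IS_DYNAMIC`, a pipeline built with `experimental_exec_dynamic=True` may move GPU data back to the CPU (`DataNode.cpu()`), which the legacy executor rejects. A usage sketch in the spirit of the new tests — the operator choice, shapes, and batch size here are illustrative assumptions:

import tensorflow as tf
import nvidia.dali.fn as fn
import nvidia.dali.plugin.tf as dali_tf
from nvidia.dali import pipeline_def

@pipeline_def(batch_size=8, num_threads=4, device_id=0, experimental_exec_dynamic=True)
def gpu_to_cpu_pipeline():
    # GPU-generated data returned on the CPU; only the dynamic executor allows this.
    data = fn.random.uniform(device="gpu", range=[0.0, 1.0], shape=[2])
    return data.cpu()

with tf.device("/cpu:0"):
    # DALIDataset picks up _exec_dynamic from the pipeline object itself,
    # so no extra dataset-level flag is needed.
    dataset = dali_tf.DALIDataset(
        pipeline=gpu_to_cpu_pipeline(),
        batch_size=8,
        output_shapes=((8, 2),),
        output_dtypes=(tf.float32,),
        num_threads=4,
        device_id=0,
    )
    for batch in dataset.take(2):
        print(batch.shape)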