PennyLaneAI · multiphaseCFD · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -61,6 +61,9 @@
 
 ### Bug fixes
 
+* Fix analytic `probs` in the `lightning.tensor` C++ layer when only a subset of the wires is measured.
+  [(#906)](https://github.com/PennyLaneAI/pennylane-lightning/pull/906)
+
 ### Contributors
 
 This release contains contributions from (in alphabetical order):

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev19"
+__version__ = "0.39.0-dev20"
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
@@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
      */
     void get_state_tensor(ComplexT *host_data,
                           const int32_t numHyperSamples = 1) {
-        std::vector<std::size_t> wires(BaseType::getNumQubits());
-        std::iota(wires.begin(), wires.end(), 0);
+        std::vector<int32_t> projected_modes{};
+        std::vector<int64_t> projected_mode_values{};
 
-        const std::size_t length = std::size_t{1} << wires.size();
+        const std::size_t length = std::size_t{1} << BaseType::getNumQubits();
 
         DataBuffer<CFP_t, int> d_output_tensor(length, getDevTag(), true);
 
-        get_state_tensor(d_output_tensor.getData(), d_output_tensor.getLength(),
-                         wires, numHyperSamples);
+        get_accessor_(d_output_tensor.getData(), length, projected_modes,
+                      projected_mode_values, numHyperSamples);
 
         d_output_tensor.CopyGpuDataToHost(host_data, length);
     }
 
     /**
-     * @brief Get a slice of the full state tensor
+     * @brief Get a slice of the full state tensor.
      *
      * @param tensor_data Pointer to the device memory for state tensor data.
      * @param tensor_data_size Size of the state tensor data.
-     * @param wires Wires to get the state tensor for.
+     * @param projected_modes Modes (qubit indices) held fixed in the slice.
+     * @param projected_mode_values Value (0 or 1) of each projected mode.
      * @param numHyperSamples Number of hyper samples to use in the calculation
      * and is set to 1 by default.
      */
     void get_state_tensor(CFP_t *tensor_data,
                           const std::size_t tensor_data_size,
-                          const std::vector<std::size_t> &wires,
+                          const std::vector<int32_t> &projected_modes,
+                          const std::vector<int64_t> &projected_mode_values,
                           const int32_t numHyperSamples = 1) const {
-        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
-            wires, BaseType::getNumQubits());
-
-        std::vector<int32_t> projected_modes{};
-
-        for (int32_t idx = 0;
-             idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
-            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
-            if (it == stateModes.end()) {
-                projected_modes.emplace_back(idx);
-            }
-        }
-
-        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
-
-        if (projected_modes.empty()) {
-            get_accessor_(tensor_data, tensor_data_size, projected_modes,
-                          projectedModeValues, numHyperSamples);
-        } else {
-            DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);
-
-            const std::size_t projected_modes_size = std::size_t(1)
-                                                     << projected_modes.size();
-            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
-                for (std::size_t j = 0; j < projected_modes.size(); j++) {
-                    projectedModeValues[j] = (idx >> j) & 1;
-                }
-
-                get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
-                              projectedModeValues, numHyperSamples);
-                // Copy the data to the output tensor
-                scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
-                                  tmp.getData(), tensor_data, tmp.getLength(),
-                                  getDevTag().getDeviceID(),
-                                  getDevTag().getStreamID(), getCublasCaller());
-            }
-        }
+        get_accessor_(tensor_data, tensor_data_size, projected_modes,
+                      projected_mode_values, numHyperSamples);
     }
 
   private:
@@ -478,13 +445,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
      * @param tensor_data Pointer to the device memory for state tensor data.
      * @param tensor_data_size Size of the tensor data.
      * @param projected_modes Projected modes to get the state tensor for.
-     * @param projectedModeValues Values of the projected modes.
+     * @param projected_mode_values Values of the projected modes.
      * @param numHyperSamples Number of hyper samples to use in the calculation
      * and is set to 1 by default.
      */
     void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
                        const std::vector<int32_t> &projected_modes,
-                       const std::vector<int64_t> &projectedModeValues,
+                       const std::vector<int64_t> &projected_mode_values,
                        const int32_t numHyperSamples = 1) const {
         cutensornetStateAccessor_t accessor;
         PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
@@ -543,7 +510,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
             /* const cutensornetHandle_t */ getTNCudaHandle(),
             /* cutensornetStateAccessor_t */ accessor,
             /* const int64_t * projectedModeValues */
-            projectedModeValues.data(),
+            projected_mode_values.data(),
             /* cutensornetWorkspaceDescriptor_t */ workDesc,
             /* void *amplitudesTensor*/
             static_cast<void *>(tensor_data),

diff --git a/...lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp b/...lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp
@@ -107,22 +107,71 @@ template <class TensorNetT> class MeasurementsTNCuda {
         DataBuffer<CFP_t, int> d_output_tensor(
             length, tensor_network_.getDevTag(), true);
 
+        DataBuffer<PrecisionT, int> d_output_probs(
+            length, tensor_network_.getDevTag(), true);
+
         d_output_tensor.zeroInit();
+        d_output_probs.zeroInit();
 
-        tensor_network_.get_state_tensor(d_output_tensor.getData(),
-                                         d_output_tensor.getLength(), wires,
-                                         numHyperSamples);
+        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
+            wires, tensor_network_.getNumQubits());
 
-        // `10` here means `1024` elements to be calculated
-        // LCOV_EXCL_START
-        if (wires.size() > 10) {
-            DataBuffer<PrecisionT, int> d_output_probs(
-                length, tensor_network_.getDevTag(), true);
+        std::vector<int32_t> projected_modes{};
 
+        for (int32_t idx = 0;
+             idx < static_cast<int32_t>(tensor_network_.getNumQubits());
+             idx++) {
+            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
+            if (it == stateModes.end()) {
+                projected_modes.emplace_back(idx);
+            }
+        }
+
+        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
+
+        if (projected_modes.size() == 0) {
+            tensor_network_.get_state_tensor(d_output_tensor.getData(),
+                                             d_output_tensor.getLength(), {},
+                                             {}, numHyperSamples);
             getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
                           length, static_cast<int>(thread_per_block),
                           tensor_network_.getDevTag().getStreamID());
 
+        } else {
+            // A shift by 64 or more bits is undefined, so reject 64 as well.
+            PL_ABORT_IF(projected_modes.size() >= 64,
+                        "Number of projected modes must be less than 64.");
+            const std::size_t projected_modes_size = std::size_t(1)
+                                                     << projected_modes.size();
+
+            DataBuffer<PrecisionT, int> tmp_probs(
+                length, tensor_network_.getDevTag(), true);
+
+            // Marginalise: visit every 0/1 assignment of the projected
+            // (unmeasured) modes and sum the per-slice probabilities.
+            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
+                for (std::size_t j = 0; j < projected_modes.size(); j++) {
+                    projectedModeValues[j] = (idx >> j) & 1;
+                }
+                tensor_network_.get_state_tensor(
+                    d_output_tensor.getData(), length, projected_modes,
+                    projectedModeValues, numHyperSamples);
+                getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
+                              length, static_cast<int>(thread_per_block),
+                              tensor_network_.getDevTag().getStreamID());
+                // Accumulate this slice: d_output_probs += tmp_probs.
+                scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
+                                 d_output_probs.getData(),
+                                 tmp_probs.getLength(),
+                                 tensor_network_.getDevTag().getDeviceID(),
+                                 tensor_network_.getDevTag().getStreamID(),
+                                 tensor_network_.getCublasCaller());
+            }
+        }
+
+        // `10` here means `1024` elements to be calculated
+        // LCOV_EXCL_START
+        if (wires.size() > 10) {
             PrecisionT sum;
 
             asum_CUDA_device<PrecisionT>(
@@ -144,16 +193,11 @@ template <class TensorNetT> class MeasurementsTNCuda {
             // number of wires. The CPU calculation is faster than the GPU
             // calculation for a small number of wires due to the overhead of
             // the GPU kernel launch.
-            std::vector<ComplexT> h_state_vector(length);
-            d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
-                                              h_state_vector.size());
-            // TODO: OMP support
-            for (std::size_t i = 0; i < length; i++) {
-                h_res[i] = std::norm(h_state_vector[i]);
-            }
+            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());
 
             // TODO: OMP support
-            PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
+            PrecisionT sum =
+                std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});
 
             PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
             // TODO: OMP support

diff --git a/...core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp b/...core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp
@@ -93,6 +93,18 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
         auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
         REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
     }
+
+    SECTION("Test excessive projected wires failure") {
+        // 100 qubits with 4 measured wires leaves 96 projected modes, which
+        // is more than `probs` supports, so the call must throw.
+        std::size_t maxBondDim = GENERATE(2, 3, 4, 5);
+        std::size_t num_qubits = 100;
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+
+        REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException);
+    }
 }
 
 TEMPLATE_TEST_CASE("Samples", "[Measures]", float, double) {

diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2,
     }
 }
 
+/**
+ * @brief cuBLAS backed GPU SAXPY/DAXPY: computes `v2 = a * v1 + v2` in place.
+ *
+ * @tparam T Float data-type. Only float and double are accepted.
+ * @tparam DevTypeID Type of the device identifier.
+ * @param a Scaling factor applied to `v1`.
+ * @param v1 Device pointer to the input vector (read only).
+ * @param v2 Device pointer to the vector updated in place (the result).
+ * @param data_size Number of elements in each of `v1` and `v2`.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+template <class T = double, class DevTypeID = int>
+inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size,
+                             DevTypeID dev_id, cudaStream_t stream_id,
+                             const CublasCaller &cublas) {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "scaleAndAdd_CUDA only supports float and double.");
+    if constexpr (std::is_same_v<T, float>) {
+        cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &a, v1, 1, v2,
+                    1);
+    } else {
+        cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &a, v1, 1, v2,
+                    1);
+    }
+}
+
 /**
  * @brief cuBLAS backed GPU data scaling.
  *

diff --git a/tests/lightning_tensor/test_measurements_class.py b/tests/lightning_tensor/test_measurements_class.py
@@ -126,3 +126,26 @@ def test_not_supported_shadowmp_shot_measurements(self):
 
             with pytest.raises(TypeError):
                 m.measure_tensor_network(tape)
+
+    @pytest.mark.parametrize("n_qubits", range(4, 14, 2))
+    @pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 14, 2)))
+    def test_probs_many_wires(self, n_qubits, n_targets, tol):
+        """Check `probs` on the leading wires of a random state against `default.qubit`."""
+        if n_targets >= n_qubits:
+            pytest.skip("Number of targets cannot exceed the number of wires.")
+
+        lightning = qml.device(device_name, wires=n_qubits)
+        reference = qml.device("default.qubit", wires=n_qubits)
+
+        dim = 2**n_qubits
+        amplitudes = np.random.rand(dim) + 1.0j * np.random.rand(dim)
+        amplitudes /= np.linalg.norm(amplitudes)
+
+        prep = qml.StatePrep(amplitudes, wires=range(n_qubits))
+        tape = qml.tape.QuantumScript([prep], [qml.probs(wires=range(n_targets))])
+
+        # Only the first `n_targets` wires are measured on both devices.
+        res = lightning.execute(tape)
+        ref = reference.execute(tape)
+
+        assert np.allclose(res, ref, atol=tol, rtol=0)