Fix qml.probs for lightning.tensor (#906)
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
      If you've fixed a bug or added code that should be tested, add a test to
      the [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and documented.
      If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed line and
fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-73501]

`cutensornet`'s `Accessor` API allows users to extract single state
amplitudes (elements of the state tensor), slices of state amplitudes
(slices of the state tensor), and the full state tensor. The previous
implementation summed the slices of the state tensor and then computed the
probabilities from that sum, which was a bug. This PR instead computes the
probabilities of each slice of the state tensor and accumulates those,
fixing the bug.
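
To see the distinction, here is a minimal NumPy sketch (illustrative only, not code from this PR): for a two-qubit state with wire 0 measured and wire 1 projected, the correct marginal accumulates the probability of each slice, while summing the amplitude slices first picks up spurious cross terms.

```python
import numpy as np

rng = np.random.default_rng(0)
state = rng.standard_normal(4) + 1j * rng.standard_normal(4)
state /= np.linalg.norm(state)
psi = state.reshape(2, 2)  # axes: (wire 0, wire 1)

# Fixed behaviour: probabilities of each projected slice, then sum.
probs_fixed = sum(np.abs(psi[:, v]) ** 2 for v in (0, 1))

# Old (buggy) behaviour: sum the amplitude slices, then take |.|^2.
probs_buggy = np.abs(sum(psi[:, v] for v in (0, 1))) ** 2

print(np.isclose(probs_fixed.sum(), 1.0))     # True: a valid marginal
print(np.allclose(probs_fixed, probs_buggy))  # False: cross terms corrupt it
```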

**Description of the Change:**

**Benefits:**

Analytic `probs` is now supported for `lightning.tensor`.

**Possible Drawbacks:**

The computational cost can become prohibitive when the number of projected
wires is large.
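
To make the scaling concrete: with `n` wires in total and `k` of them measured, the fixed path extracts and reduces `2^(n-k)` slices, one per bit assignment of the projected wires; measuring 4 of 30 wires therefore costs `2^26 ≈ 6.7 × 10^7` accessor calls. The C++ layer now aborts when more than 64 modes are projected, since `projected_modes_size` would otherwise overflow `std::size_t`.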

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
multiphaseCFD and ringo-but-quantum authored Sep 11, 2024
1 parent ef3a8cc commit dbd2cc4
Showing 7 changed files with 144 additions and 65 deletions.
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
@@ -64,6 +64,9 @@

### Bug fixes

+* Bug fix for analytic `probs` in the `lightning.tensor` C++ layer.
+  [(#906)](https://github.com/PennyLaneAI/pennylane-lightning/pull/906)

### Contributors

This release contains contributions from (in alphabetical order):
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

-__version__ = "0.39.0-dev20"
+__version__ = "0.39.0-dev21"
@@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     */
    void get_state_tensor(ComplexT *host_data,
                          const int32_t numHyperSamples = 1) {
-        std::vector<std::size_t> wires(BaseType::getNumQubits());
-        std::iota(wires.begin(), wires.end(), 0);
+        std::vector<int32_t> projected_modes{};
+        std::vector<int64_t> projected_mode_values{};

-        const std::size_t length = std::size_t{1} << wires.size();
+        const std::size_t length = std::size_t{1} << BaseType::getNumQubits();

        DataBuffer<CFP_t, int> d_output_tensor(length, getDevTag(), true);

-        get_state_tensor(d_output_tensor.getData(),
-                         d_output_tensor.getLength(), wires, numHyperSamples);
+        get_accessor_(d_output_tensor.getData(), length, projected_modes,
+                      projected_mode_values, numHyperSamples);

        d_output_tensor.CopyGpuDataToHost(host_data, length);
    }

    /**
-     * @brief Get a slice of the full state tensor
+     * @brief Get a slice of the full state tensor.
     *
     * @param tensor_data Pointer to the device memory for state tensor data.
     * @param tensor_data_size Size of the state tensor data.
-     * @param wires Wires to get the state tensor for.
+     * @param projected_modes Projected modes to get the state tensor for.
+     * @param projected_mode_values Values of the projected modes.
     * @param numHyperSamples Number of hyper samples to use in the calculation
     * and is set to 1 by default.
     */
    void get_state_tensor(CFP_t *tensor_data,
                          const std::size_t tensor_data_size,
-                          const std::vector<std::size_t> &wires,
+                          const std::vector<int32_t> &projected_modes,
+                          const std::vector<int64_t> &projected_mode_values,
                          const int32_t numHyperSamples = 1) const {
-        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
-            wires, BaseType::getNumQubits());
-
-        std::vector<int32_t> projected_modes{};
-
-        for (int32_t idx = 0;
-             idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
-            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
-            if (it == stateModes.end()) {
-                projected_modes.emplace_back(idx);
-            }
-        }
-
-        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
-
-        if (projected_modes.empty()) {
-            get_accessor_(tensor_data, tensor_data_size, projected_modes,
-                          projectedModeValues, numHyperSamples);
-        } else {
-            DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);
-
-            const std::size_t projected_modes_size = std::size_t(1)
-                                                     << projected_modes.size();
-            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
-                for (std::size_t j = 0; j < projected_modes.size(); j++) {
-                    projectedModeValues[j] = (idx >> j) & 1;
-                }
-
-                get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
-                              projectedModeValues, numHyperSamples);
-                // Copy the data to the output tensor
-                scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
-                                  tmp.getData(), tensor_data, tmp.getLength(),
-                                  getDevTag().getDeviceID(),
-                                  getDevTag().getStreamID(), getCublasCaller());
-            }
-        }
+        get_accessor_(tensor_data, tensor_data_size, projected_modes,
+                      projected_mode_values, numHyperSamples);
    }

private:
@@ -478,13 +445,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     * @param tensor_data Pointer to the device memory for state tensor data.
     * @param tensor_data_size Size of the tensor data.
     * @param projected_modes Projected modes to get the state tensor for.
-     * @param projectedModeValues Values of the projected modes.
+     * @param projected_mode_values Values of the projected modes.
     * @param numHyperSamples Number of hyper samples to use in the calculation
     * and is set to 1 by default.
     */
    void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
                       const std::vector<int32_t> &projected_modes,
-                       const std::vector<int64_t> &projectedModeValues,
+                       const std::vector<int64_t> &projected_mode_values,
                       const int32_t numHyperSamples = 1) const {
        cutensornetStateAccessor_t accessor;
        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
@@ -543,7 +510,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
            /* const cutensornetHandle_t */ getTNCudaHandle(),
            /* cutensornetStateAccessor_t */ accessor,
            /* const int64_t * projectedModeValues */
-            projectedModeValues.data(),
+            projected_mode_values.data(),
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* void *amplitudesTensor*/
            static_cast<void *>(tensor_data),
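
For context, here is a NumPy model of the slice the `Accessor` returns when `get_state_tensor` pins each projected mode to a value (illustrative only; `state_tensor_slice` is a hypothetical helper, not the cutensornet API):

```python
import numpy as np

def state_tensor_slice(state, num_wires, projected_modes, projected_mode_values):
    """Pin each projected mode to its value; keep all remaining axes."""
    psi = np.asarray(state).reshape((2,) * num_wires)
    index = [slice(None)] * num_wires
    for mode, value in zip(projected_modes, projected_mode_values):
        index[mode] = value
    return psi[tuple(index)]

psi = np.arange(8, dtype=complex)            # toy 3-wire state tensor
full = state_tensor_slice(psi, 3, [], [])    # no projection: full tensor
part = state_tensor_slice(psi, 3, [2], [1])  # pin wire 2 to |1>
print(full.shape, part.shape)                # (2, 2, 2) (2, 2)
```

With the slicing logic gone, `get_state_tensor` is now a thin pass-through to `get_accessor_`, and the responsibility for enumerating slices moves to the measurement layer below.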
@@ -107,22 +107,73 @@ template <class TensorNetT> class MeasurementsTNCuda {
        DataBuffer<CFP_t, int> d_output_tensor(
            length, tensor_network_.getDevTag(), true);

+        DataBuffer<PrecisionT, int> d_output_probs(
+            length, tensor_network_.getDevTag(), true);
+
        d_output_tensor.zeroInit();
+        d_output_probs.zeroInit();

-        tensor_network_.get_state_tensor(d_output_tensor.getData(),
-                                         d_output_tensor.getLength(), wires,
-                                         numHyperSamples);
+        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
+            wires, tensor_network_.getNumQubits());

-        // `10` here means `1024` elements to be calculated
-        // LCOV_EXCL_START
-        if (wires.size() > 10) {
-            DataBuffer<PrecisionT, int> d_output_probs(
-                length, tensor_network_.getDevTag(), true);
+        std::vector<int32_t> projected_modes{};
+
+        for (int32_t idx = 0;
+             idx < static_cast<int32_t>(tensor_network_.getNumQubits());
+             idx++) {
+            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
+            if (it == stateModes.end()) {
+                projected_modes.emplace_back(idx);
+            }
+        }
+
+        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
+
+        if (projected_modes.size() == 0) {
+            tensor_network_.get_state_tensor(d_output_tensor.getData(),
+                                             d_output_tensor.getLength(), {},
+                                             {}, numHyperSamples);
            getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
                          length, static_cast<int>(thread_per_block),
                          tensor_network_.getDevTag().getStreamID());

+        } else {
+            PL_ABORT_IF(projected_modes.size() > 64,
+                        "Number of projected modes is greater than 64 and the "
+                        "value of projected_modes_size will exceed "
+                        "std::numeric_limits<size_t>::max()");
+            const std::size_t projected_modes_size = std::size_t(1U)
+                                                     << projected_modes.size();
+
+            DataBuffer<PrecisionT, int> tmp_probs(
+                length, tensor_network_.getDevTag(), true);
+
+            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
+                for (std::size_t j = 0; j < projected_modes.size(); j++) {
+                    projectedModeValues[j] = (idx >> j) & 1U;
+                }
+
+                tensor_network_.get_state_tensor(
+                    d_output_tensor.getData(), length, projected_modes,
+                    projectedModeValues, numHyperSamples);
+
+                getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
+                              length, static_cast<int>(thread_per_block),
+                              tensor_network_.getDevTag().getStreamID());
+
+                // Copy the data to the output tensor
+                scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
+                                 d_output_probs.getData(),
+                                 tmp_probs.getLength(),
+                                 tensor_network_.getDevTag().getDeviceID(),
+                                 tensor_network_.getDevTag().getStreamID(),
+                                 tensor_network_.getCublasCaller());
+            }
+        }
+
+        // `10` here means `1024` elements to be calculated
+        // LCOV_EXCL_START
+        if (wires.size() > 10) {
            PrecisionT sum;

            asum_CUDA_device<PrecisionT>(
@@ -144,16 +195,11 @@ template <class TensorNetT> class MeasurementsTNCuda {
            // number of wires. The CPU calculation is faster than the GPU
            // calculation for a small number of wires due to the overhead of
            // the GPU kernel launch.
-            std::vector<ComplexT> h_state_vector(length);
-            d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
-                                              h_state_vector.size());
-            // TODO: OMP support
-            for (std::size_t i = 0; i < length; i++) {
-                h_res[i] = std::norm(h_state_vector[i]);
-            }
+            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());

            // TODO: OMP support
-            PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
+            PrecisionT sum =
+                std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});

            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
            // TODO: OMP support
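
A Python model of the fixed accumulation loop above (illustrative only; `probs_by_slices` is a hypothetical name): every bit assignment of the projected modes selects one slice, each slice is turned into probabilities (the `getProbs_CUDA` step), and only then accumulated (the `scaleAndAdd_CUDA` step). The `PL_ABORT_IF` guard exists because `2^k` slices are enumerated; the new C++ test below exercises it with 96 projected modes.

```python
import numpy as np

def probs_by_slices(state, num_wires, projected_modes):
    """Marginal probabilities over the non-projected wires."""
    assert len(projected_modes) <= 64  # mirrors the new PL_ABORT_IF guard
    psi = np.asarray(state).reshape((2,) * num_wires)
    probs = 0.0
    for idx in range(2 ** len(projected_modes)):
        # Same bit unpacking as `projectedModeValues[j] = (idx >> j) & 1U`.
        index = [slice(None)] * num_wires
        for j, mode in enumerate(projected_modes):
            index[mode] = (idx >> j) & 1
        # Per-slice probabilities first, then accumulate.
        probs = probs + np.abs(psi[tuple(index)].ravel()) ** 2
    return probs

rng = np.random.default_rng(42)
psi = rng.standard_normal(8) + 1j * rng.standard_normal(8)
psi /= np.linalg.norm(psi)
print(probs_by_slices(psi, 3, [2]).sum())  # ~1.0: marginal over wires {0, 1}
```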
@@ -93,6 +93,18 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
        REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
    }
+
+    SECTION("Test excessive projected wires failure") {
+        // Defining the State Vector that will be measured.
+        std::size_t bondDim = GENERATE(2, 3, 4, 5);
+        std::size_t num_qubits = 100;
+        std::size_t maxBondDim = bondDim;
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+        REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException);
+    }
}

TEMPLATE_TEST_CASE("Samples", "[Measures]", float, double) {
28 changes: 28 additions & 0 deletions pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2,
    }
}

+/**
+ * @brief cuBLAS backed GPU SAXPY/DAXPY.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @param a scaling factor
+ * @param v1 Device data pointer 1 (data to be modified)
+ * @param v2 Device data pointer 2 (the result data)
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+
+template <class T = double, class DevTypeID = int>
+inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size,
+                             DevTypeID dev_id, cudaStream_t stream_id,
+                             const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<T, float>) {
+        const float alpha = a;
+        cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    } else if constexpr (std::is_same_v<T, double>) {
+        const double alpha = a;
+        cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    }
+}

/**
* @brief cuBLAS backed GPU data scaling.
*
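
The new helper is a thin wrapper over cuBLAS `cublasSaxpy`/`cublasDaxpy`; semantically it is the real axpy update `v2 <- a * v1 + v2` used above to accumulate per-slice probabilities in place. A NumPy sketch of the semantics (not the CUDA code):

```python
import numpy as np

def scale_and_add(a, v1, v2):
    """Model of scaleAndAdd_CUDA: v2 <- a * v1 + v2, in place."""
    v2 += a * v1
    return v2

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([10.0, 20.0, 30.0])
print(scale_and_add(0.5, v1, v2))  # [10.5 21.  31.5]
```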
23 changes: 23 additions & 0 deletions tests/lightning_tensor/test_measurements_class.py
@@ -126,3 +126,26 @@ def test_not_supported_shadowmp_shot_measurements(self):

        with pytest.raises(TypeError):
            m.measure_tensor_network(tape)
+
+    @pytest.mark.parametrize("n_qubits", range(4, 14, 2))
+    @pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 14, 2)))
+    def test_probs_many_wires(self, n_qubits, n_targets, tol):
+        """Test probs measuring many wires of a random quantum state."""
+        if n_targets >= n_qubits:
+            pytest.skip("Number of targets cannot exceed the number of wires.")
+
+        dev = qml.device(device_name, wires=n_qubits)
+        dq = qml.device("default.qubit", wires=n_qubits)
+
+        init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
+        init_state /= np.linalg.norm(init_state)
+
+        ops = [qml.StatePrep(init_state, wires=range(n_qubits))]
+
+        mp = qml.probs(wires=range(n_targets))
+
+        tape = qml.tape.QuantumScript(ops, [mp])
+        res = dev.execute(tape)
+        ref = dq.execute(tape)
+
+        assert np.allclose(res, ref, atol=tol, rtol=0)
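
Finally, a minimal end-to-end sketch of the user-facing behaviour this commit fixes (hypothetical snippet; assumes PennyLane with a `lightning.tensor` build that includes this fix):

```python
import pennylane as qml

dev = qml.device("lightning.tensor", wires=4)

@qml.qnode(dev)
def circuit():
    qml.Hadamard(wires=0)
    qml.CNOT(wires=[0, 1])
    return qml.probs(wires=[0, 1])  # analytic probs; wires 2-3 are projected out

print(circuit())  # expected ~[0.5, 0.0, 0.0, 0.5] for the Bell pair
```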
