diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 05c449a3ff..49febba26a 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -64,6 +64,9 @@ ### Bug fixes +* Bug fix for analytic `probs` in the `lightning.tensor` C++ layer. + [(#906)](https://github.com/PennyLaneAI/pennylane-lightning/pull/906) + ### Contributors This release contains contributions from (in alphabetical order): diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index e6f36589a2..d905c8af96 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.39.0-dev20" +__version__ = "0.39.0-dev21" diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp index a06097ff51..1ed1382237 100644 --- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp @@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase { */ void get_state_tensor(ComplexT *host_data, const int32_t numHyperSamples = 1) { - std::vector wires(BaseType::getNumQubits()); - std::iota(wires.begin(), wires.end(), 0); + std::vector projected_modes{}; + std::vector projected_mode_values{}; - const std::size_t length = std::size_t{1} << wires.size(); + const std::size_t length = std::size_t{1} << BaseType::getNumQubits(); DataBuffer d_output_tensor(length, getDevTag(), true); - get_state_tensor(d_output_tensor.getData(), d_output_tensor.getLength(), - wires, numHyperSamples); + get_accessor_(d_output_tensor.getData(), length, projected_modes, + projected_mode_values, numHyperSamples); d_output_tensor.CopyGpuDataToHost(host_data, length); } /** - * @brief Get a slice of the full state tensor + * @brief Get a slice of the full state tensor. 
* * @param tensor_data Pointer to the device memory for state tensor data. * @param tensor_data_size Size of the state tensor data. - * @param wires Wires to get the state tensor for. + * @param projected_modes Projected modes to get the state tensor for. + * @param projected_mode_values Values of the projected modes. * @param numHyperSamples Number of hyper samples to use in the calculation * and is set to 1 by default. */ void get_state_tensor(CFP_t *tensor_data, const std::size_t tensor_data_size, - const std::vector &wires, + const std::vector &projected_modes, + const std::vector &projected_mode_values, const int32_t numHyperSamples = 1) const { - auto stateModes = cuUtil::NormalizeCastIndices( - wires, BaseType::getNumQubits()); - - std::vector projected_modes{}; - - for (int32_t idx = 0; - idx < static_cast(BaseType::getNumQubits()); idx++) { - auto it = std::find(stateModes.begin(), stateModes.end(), idx); - if (it == stateModes.end()) { - projected_modes.emplace_back(idx); - } - } - - std::vector projectedModeValues(projected_modes.size(), 0); - - if (projected_modes.empty()) { - get_accessor_(tensor_data, tensor_data_size, projected_modes, - projectedModeValues, numHyperSamples); - } else { - DataBuffer tmp(tensor_data_size, getDevTag(), true); - - const std::size_t projected_modes_size = std::size_t(1) - << projected_modes.size(); - for (std::size_t idx = 0; idx < projected_modes_size; idx++) { - for (std::size_t j = 0; j < projected_modes.size(); j++) { - projectedModeValues[j] = (idx >> j) & 1; - } - - get_accessor_(tmp.getData(), tensor_data_size, projected_modes, - projectedModeValues, numHyperSamples); - // Copy the data to the output tensor - scaleAndAddC_CUDA(std::complex{1.0, 0.0}, - tmp.getData(), tensor_data, tmp.getLength(), - getDevTag().getDeviceID(), - getDevTag().getStreamID(), getCublasCaller()); - } - } + get_accessor_(tensor_data, tensor_data_size, projected_modes, + projected_mode_values, numHyperSamples); } private: @@ -478,13 +445,13 
@@ class TNCudaBase : public TensornetBase { * @param tensor_data Pointer to the device memory for state tensor data. * @param tensor_data_size Size of the tensor data. * @param projected_modes Projected modes to get the state tensor for. - * @param projectedModeValues Values of the projected modes. + * @param projected_mode_values Values of the projected modes. * @param numHyperSamples Number of hyper samples to use in the calculation * and is set to 1 by default. */ void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size, const std::vector &projected_modes, - const std::vector &projectedModeValues, + const std::vector &projected_mode_values, const int32_t numHyperSamples = 1) const { cutensornetStateAccessor_t accessor; PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor( @@ -543,7 +510,7 @@ class TNCudaBase : public TensornetBase { /* const cutensornetHandle_t */ getTNCudaHandle(), /* cutensornetStateAccessor_t */ accessor, /* const int64_t * projectedModeValues */ - projectedModeValues.data(), + projected_mode_values.data(), /* cutensornetWorkspaceDescriptor_t */ workDesc, /* void *amplitudesTensor*/ static_cast(tensor_data), diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp index 34c22995f5..146f29f1b9 100644 --- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp @@ -107,22 +107,73 @@ template class MeasurementsTNCuda { DataBuffer d_output_tensor( length, tensor_network_.getDevTag(), true); + DataBuffer d_output_probs( + length, tensor_network_.getDevTag(), true); + d_output_tensor.zeroInit(); + d_output_probs.zeroInit(); - tensor_network_.get_state_tensor(d_output_tensor.getData(), - d_output_tensor.getLength(), wires, - 
numHyperSamples); + auto stateModes = cuUtil::NormalizeCastIndices( + wires, tensor_network_.getNumQubits()); - // `10` here means `1024` elements to be calculated - // LCOV_EXCL_START - if (wires.size() > 10) { - DataBuffer d_output_probs( - length, tensor_network_.getDevTag(), true); + std::vector projected_modes{}; + for (int32_t idx = 0; + idx < static_cast(tensor_network_.getNumQubits()); + idx++) { + auto it = std::find(stateModes.begin(), stateModes.end(), idx); + if (it == stateModes.end()) { + projected_modes.emplace_back(idx); + } + } + + std::vector projectedModeValues(projected_modes.size(), 0); + + if (projected_modes.size() == 0) { + tensor_network_.get_state_tensor(d_output_tensor.getData(), + d_output_tensor.getLength(), {}, + {}, numHyperSamples); getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(), length, static_cast(thread_per_block), tensor_network_.getDevTag().getStreamID()); + } else { + PL_ABORT_IF(projected_modes.size() > 64, + "Number of projected modes is greater than 64 and the " + "value of projected_modes_size will exceed " + "std::numeric_limits::max()"); + const std::size_t projected_modes_size = std::size_t(1U) + << projected_modes.size(); + + DataBuffer tmp_probs( + length, tensor_network_.getDevTag(), true); + + for (std::size_t idx = 0; idx < projected_modes_size; idx++) { + for (std::size_t j = 0; j < projected_modes.size(); j++) { + projectedModeValues[j] = (idx >> j) & 1U; + } + + tensor_network_.get_state_tensor( + d_output_tensor.getData(), length, projected_modes, + projectedModeValues, numHyperSamples); + + getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(), + length, static_cast(thread_per_block), + tensor_network_.getDevTag().getStreamID()); + + // Accumulate the partial probabilities into the output tensor + scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(), + d_output_probs.getData(), + tmp_probs.getLength(), + tensor_network_.getDevTag().getDeviceID(), + tensor_network_.getDevTag().getStreamID(), + 
tensor_network_.getCublasCaller()); + } + } + + // `10` here means `1024` elements to be calculated + // LCOV_EXCL_START + if (wires.size() > 10) { PrecisionT sum; asum_CUDA_device( @@ -144,16 +195,11 @@ template class MeasurementsTNCuda { // number of wires. The CPU calculation is faster than the GPU // calculation for a small number of wires due to the overhead of // the GPU kernel launch. - std::vector h_state_vector(length); - d_output_tensor.CopyGpuDataToHost(h_state_vector.data(), - h_state_vector.size()); - // TODO: OMP support - for (std::size_t i = 0; i < length; i++) { - h_res[i] = std::norm(h_state_vector[i]); - } + d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size()); // TODO: OMP support - PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0); + PrecisionT sum = + std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0}); PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero."); // TODO: OMP support diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp index 74923cf87c..2284c46dc8 100644 --- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp @@ -93,6 +93,18 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) { auto measure = MeasurementsTNCuda(mps_state); REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException); } + + SECTION("Test excessive projected wires failure") { + // Defining the State Vector that will be measured. 
+ std::size_t bondDim = GENERATE(2, 3, 4, 5); + std::size_t num_qubits = 100; + std::size_t maxBondDim = bondDim; + + TensorNetT mps_state{num_qubits, maxBondDim}; + + auto measure = MeasurementsTNCuda(mps_state); + REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException); + } } TEMPLATE_TEST_CASE("Samples", "[Measures]", float, double) { diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp index 1dad632532..cd422899b5 100644 --- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp @@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2, } } +/** + * @brief cuBLAS backed GPU SAXPY/DAXPY. + * + * @tparam T Float data-type. Accepts float and double. + * @param a scaling factor + * @param v1 Device data pointer 1 (input data, scaled by `a`; not modified) + * @param v2 Device data pointer 2 (updated in place: v2 = a * v1 + v2) + * @param data_size Length of device data. + * @param dev_id the device on which the function should be executed. + * @param stream_id the CUDA stream on which the operation should be executed. + * @param cublas the CublasCaller object that manages the cuBLAS handle. + */ + +template +inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size, + DevTypeID dev_id, cudaStream_t stream_id, + const CublasCaller &cublas) { + if constexpr (std::is_same_v) { + const float alpha = a; + cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &alpha, v1, 1, + v2, 1); + } else if constexpr (std::is_same_v) { + const double alpha = a; + cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &alpha, v1, 1, + v2, 1); + } +} + /** * @brief cuBLAS backed GPU data scaling. 
* diff --git a/tests/lightning_tensor/test_measurements_class.py b/tests/lightning_tensor/test_measurements_class.py index cdae207f8a..1b0e9736d8 100644 --- a/tests/lightning_tensor/test_measurements_class.py +++ b/tests/lightning_tensor/test_measurements_class.py @@ -126,3 +126,26 @@ def test_not_supported_shadowmp_shot_measurements(self): with pytest.raises(TypeError): m.measure_tensor_network(tape) + + @pytest.mark.parametrize("n_qubits", range(4, 14, 2)) + @pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 14, 2))) + def test_probs_many_wires(self, n_qubits, n_targets, tol): + """Test probs measuring many wires of a random quantum state.""" + if n_targets >= n_qubits: + pytest.skip("Number of targets cannot exceed the number of wires.") + + dev = qml.device(device_name, wires=n_qubits) + dq = qml.device("default.qubit", wires=n_qubits) + + init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits) + init_state /= np.linalg.norm(init_state) + + ops = [qml.StatePrep(init_state, wires=range(n_qubits))] + + mp = qml.probs(wires=range(n_targets)) + + tape = qml.tape.QuantumScript(ops, [mp]) + res = dev.execute(tape) + ref = dq.execute(tape) + + assert np.allclose(res, ref, atol=tol, rtol=0)