Fix qml.probs for lightning.tensor (#906)
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
      If you've fixed a bug or added code that should be tested, add a test to
      the [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and documented.
      If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed line and
fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-73501]

`cutensornet`'s `Accessor` API allows users to extract single state
amplitudes (elements of the state tensor), slices of state amplitudes
(slices of the state tensor), and the full state tensor. The previous
implementation summed the slices of the state tensor and then computed the
probabilities from that sum, which was a bug. This PR instead computes the
probabilities of each slice of the state tensor and accumulates those,
fixing the bug.
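
To see the distinction, here is a minimal NumPy sketch (illustrative only, not code from this PR): for a two-qubit state with wire 0 measured and wire 1 projected, the correct marginal accumulates the probability of each slice, while summing the amplitude slices first picks up spurious cross terms.

```python
import numpy as np

rng = np.random.default_rng(0)
state = rng.standard_normal(4) + 1j * rng.standard_normal(4)
state /= np.linalg.norm(state)
psi = state.reshape(2, 2)  # axes: (wire 0, wire 1)

# Fixed behaviour: probabilities of each projected slice, then sum.
probs_fixed = sum(np.abs(psi[:, v]) ** 2 for v in (0, 1))

# Old (buggy) behaviour: sum the amplitude slices, then take |.|^2.
probs_buggy = np.abs(sum(psi[:, v] for v in (0, 1))) ** 2

print(np.isclose(probs_fixed.sum(), 1.0))     # True: a valid marginal
print(np.allclose(probs_fixed, probs_buggy))  # False: cross terms corrupt it
```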

**Description of the Change:**

**Benefits:**

Analytic `probs` is now supported for `lightning.tensor`.

**Possible Drawbacks:**

The computational cost can become prohibitive when the number of projected
wires is large.
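
To make the scaling concrete: with `n` wires in total and `k` of them measured, the fixed path extracts and reduces `2^(n-k)` slices, one per bit assignment of the projected wires; measuring 4 of 30 wires therefore costs `2^26 ≈ 6.7 × 10^7` accessor calls. The C++ layer now aborts when more than 64 modes are projected, since `projected_modes_size` would otherwise overflow `std::size_t`.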

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
multiphaseCFD and ringo-but-quantum authored Sep 11, 2024
1 parent ef3a8cc commit dbd2cc4
Showing 7 changed files with 144 additions and 65 deletions.
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
@@ -64,6 +64,9 @@

### Bug fixes

+* Bug fix for analytic `probs` in the `lightning.tensor` C++ layer.
+  [(#906)](https://github.com/PennyLaneAI/pennylane-lightning/pull/906)

### Contributors

This release contains contributions from (in alphabetical order):
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

-__version__ = "0.39.0-dev20"
+__version__ = "0.39.0-dev21"
@@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     */
    void get_state_tensor(ComplexT *host_data,
                          const int32_t numHyperSamples = 1) {
-        std::vector<std::size_t> wires(BaseType::getNumQubits());
-        std::iota(wires.begin(), wires.end(), 0);
+        std::vector<int32_t> projected_modes{};
+        std::vector<int64_t> projected_mode_values{};

-        const std::size_t length = std::size_t{1} << wires.size();
+        const std::size_t length = std::size_t{1} << BaseType::getNumQubits();

        DataBuffer<CFP_t, int> d_output_tensor(length, getDevTag(), true);

-        get_state_tensor(d_output_tensor.getData(),
-                         d_output_tensor.getLength(), wires, numHyperSamples);
+        get_accessor_(d_output_tensor.getData(), length, projected_modes,
+                      projected_mode_values, numHyperSamples);

        d_output_tensor.CopyGpuDataToHost(host_data, length);
    }

    /**
-     * @brief Get a slice of the full state tensor
+     * @brief Get a slice of the full state tensor.
     *
     * @param tensor_data Pointer to the device memory for state tensor data.
     * @param tensor_data_size Size of the state tensor data.
-     * @param wires Wires to get the state tensor for.
+     * @param projected_modes Projected modes to get the state tensor for.
+     * @param projected_mode_values Values of the projected modes.
     * @param numHyperSamples Number of hyper samples to use in the calculation
     * and is set to 1 by default.
     */
    void get_state_tensor(CFP_t *tensor_data,
                          const std::size_t tensor_data_size,
-                          const std::vector<std::size_t> &wires,
+                          const std::vector<int32_t> &projected_modes,
+                          const std::vector<int64_t> &projected_mode_values,
                          const int32_t numHyperSamples = 1) const {
-        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
-            wires, BaseType::getNumQubits());
-
-        std::vector<int32_t> projected_modes{};
-
-        for (int32_t idx = 0;
-             idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
-            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
-            if (it == stateModes.end()) {
-                projected_modes.emplace_back(idx);
-            }
-        }
-
-        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
-
-        if (projected_modes.empty()) {
-            get_accessor_(tensor_data, tensor_data_size, projected_modes,
-                          projectedModeValues, numHyperSamples);
-        } else {
-            DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);
-
-            const std::size_t projected_modes_size = std::size_t(1)
-                                                     << projected_modes.size();
-            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
-                for (std::size_t j = 0; j < projected_modes.size(); j++) {
-                    projectedModeValues[j] = (idx >> j) & 1;
-                }
-
-                get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
-                              projectedModeValues, numHyperSamples);
-                // Copy the data to the output tensor
-                scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
-                                  tmp.getData(), tensor_data, tmp.getLength(),
-                                  getDevTag().getDeviceID(),
-                                  getDevTag().getStreamID(), getCublasCaller());
-            }
-        }
+        get_accessor_(tensor_data, tensor_data_size, projected_modes,
+                      projected_mode_values, numHyperSamples);
    }

private:
@@ -478,13 +445,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     * @param tensor_data Pointer to the device memory for state tensor data.
     * @param tensor_data_size Size of the tensor data.
     * @param projected_modes Projected modes to get the state tensor for.
-     * @param projectedModeValues Values of the projected modes.
+     * @param projected_mode_values Values of the projected modes.
     * @param numHyperSamples Number of hyper samples to use in the calculation
     * and is set to 1 by default.
     */
    void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
                       const std::vector<int32_t> &projected_modes,
-                       const std::vector<int64_t> &projectedModeValues,
+                       const std::vector<int64_t> &projected_mode_values,
                       const int32_t numHyperSamples = 1) const {
        cutensornetStateAccessor_t accessor;
        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
@@ -543,7 +510,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
            /* const cutensornetHandle_t */ getTNCudaHandle(),
            /* cutensornetStateAccessor_t */ accessor,
            /* const int64_t * projectedModeValues */
-            projectedModeValues.data(),
+            projected_mode_values.data(),
            /* cutensornetWorkspaceDescriptor_t */ workDesc,
            /* void *amplitudesTensor*/
            static_cast<void *>(tensor_data),
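
For context, here is a NumPy model of the slice the `Accessor` returns when `get_state_tensor` pins each projected mode to a value (illustrative only; `state_tensor_slice` is a hypothetical helper, not the cutensornet API):

```python
import numpy as np

def state_tensor_slice(state, num_wires, projected_modes, projected_mode_values):
    """Pin each projected mode to its value; keep all remaining axes."""
    psi = np.asarray(state).reshape((2,) * num_wires)
    index = [slice(None)] * num_wires
    for mode, value in zip(projected_modes, projected_mode_values):
        index[mode] = value
    return psi[tuple(index)]

psi = np.arange(8, dtype=complex)            # toy 3-wire state tensor
full = state_tensor_slice(psi, 3, [], [])    # no projection: full tensor
part = state_tensor_slice(psi, 3, [2], [1])  # pin wire 2 to |1>
print(full.shape, part.shape)                # (2, 2, 2) (2, 2)
```

With the slicing logic gone, `get_state_tensor` is now a thin pass-through to `get_accessor_`, and the responsibility for enumerating slices moves to the measurement layer below.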
@@ -107,22 +107,73 @@ template <class TensorNetT> class MeasurementsTNCuda {
        DataBuffer<CFP_t, int> d_output_tensor(
            length, tensor_network_.getDevTag(), true);

+        DataBuffer<PrecisionT, int> d_output_probs(
+            length, tensor_network_.getDevTag(), true);
+
        d_output_tensor.zeroInit();
+        d_output_probs.zeroInit();

-        tensor_network_.get_state_tensor(d_output_tensor.getData(),
-                                         d_output_tensor.getLength(), wires,
-                                         numHyperSamples);
+        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
+            wires, tensor_network_.getNumQubits());

-        // `10` here means `1024` elements to be calculated
-        // LCOV_EXCL_START
-        if (wires.size() > 10) {
-            DataBuffer<PrecisionT, int> d_output_probs(
-                length, tensor_network_.getDevTag(), true);
+        std::vector<int32_t> projected_modes{};
+
+        for (int32_t idx = 0;
+             idx < static_cast<int32_t>(tensor_network_.getNumQubits());
+             idx++) {
+            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
+            if (it == stateModes.end()) {
+                projected_modes.emplace_back(idx);
+            }
+        }
+
+        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
+
+        if (projected_modes.size() == 0) {
+            tensor_network_.get_state_tensor(d_output_tensor.getData(),
+                                             d_output_tensor.getLength(), {},
+                                             {}, numHyperSamples);
            getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
                          length, static_cast<int>(thread_per_block),
                          tensor_network_.getDevTag().getStreamID());

+        } else {
+            PL_ABORT_IF(projected_modes.size() > 64,
+                        "Number of projected modes is greater than 64 and the "
+                        "value of projected_modes_size will exceed "
+                        "std::numeric_limits<size_t>::max()");
+            const std::size_t projected_modes_size = std::size_t(1U)
+                                                     << projected_modes.size();
+
+            DataBuffer<PrecisionT, int> tmp_probs(
+                length, tensor_network_.getDevTag(), true);
+
+            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
+                for (std::size_t j = 0; j < projected_modes.size(); j++) {
+                    projectedModeValues[j] = (idx >> j) & 1U;
+                }
+
+                tensor_network_.get_state_tensor(
+                    d_output_tensor.getData(), length, projected_modes,
+                    projectedModeValues, numHyperSamples);
+
+                getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
+                              length, static_cast<int>(thread_per_block),
+                              tensor_network_.getDevTag().getStreamID());
+
+                // Copy the data to the output tensor
+                scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
+                                 d_output_probs.getData(),
+                                 tmp_probs.getLength(),
+                                 tensor_network_.getDevTag().getDeviceID(),
+                                 tensor_network_.getDevTag().getStreamID(),
+                                 tensor_network_.getCublasCaller());
+            }
+        }
+
+        // `10` here means `1024` elements to be calculated
+        // LCOV_EXCL_START
+        if (wires.size() > 10) {
            PrecisionT sum;

            asum_CUDA_device<PrecisionT>(
@@ -144,16 +195,11 @@ template <class TensorNetT> class MeasurementsTNCuda {
            // number of wires. The CPU calculation is faster than the GPU
            // calculation for a small number of wires due to the overhead of
            // the GPU kernel launch.
-            std::vector<ComplexT> h_state_vector(length);
-            d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
-                                              h_state_vector.size());
-            // TODO: OMP support
-            for (std::size_t i = 0; i < length; i++) {
-                h_res[i] = std::norm(h_state_vector[i]);
-            }
+            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());

            // TODO: OMP support
-            PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
+            PrecisionT sum =
+                std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});

            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
            // TODO: OMP support
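
A Python model of the fixed accumulation loop above (illustrative only; `probs_by_slices` is a hypothetical name): every bit assignment of the projected modes selects one slice, each slice is turned into probabilities (the `getProbs_CUDA` step), and only then accumulated (the `scaleAndAdd_CUDA` step). The `PL_ABORT_IF` guard exists because `2^k` slices are enumerated; the new C++ test below exercises it with 96 projected modes.

```python
import numpy as np

def probs_by_slices(state, num_wires, projected_modes):
    """Marginal probabilities over the non-projected wires."""
    assert len(projected_modes) <= 64  # mirrors the new PL_ABORT_IF guard
    psi = np.asarray(state).reshape((2,) * num_wires)
    probs = 0.0
    for idx in range(2 ** len(projected_modes)):
        # Same bit unpacking as `projectedModeValues[j] = (idx >> j) & 1U`.
        index = [slice(None)] * num_wires
        for j, mode in enumerate(projected_modes):
            index[mode] = (idx >> j) & 1
        # Per-slice probabilities first, then accumulate.
        probs = probs + np.abs(psi[tuple(index)].ravel()) ** 2
    return probs

rng = np.random.default_rng(42)
psi = rng.standard_normal(8) + 1j * rng.standard_normal(8)
psi /= np.linalg.norm(psi)
print(probs_by_slices(psi, 3, [2]).sum())  # ~1.0: marginal over wires {0, 1}
```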
@@ -93,6 +93,18 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
        REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
    }
+
+    SECTION("Test excessive projected wires failure") {
+        // Defining the State Vector that will be measured.
+        std::size_t bondDim = GENERATE(2, 3, 4, 5);
+        std::size_t num_qubits = 100;
+        std::size_t maxBondDim = bondDim;
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+        REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException);
+    }
}

TEMPLATE_TEST_CASE("Samples", "[Measures]", float, double) {
28 changes: 28 additions & 0 deletions pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2,
    }
}

+/**
+ * @brief cuBLAS backed GPU SAXPY/DAXPY.
+ *
+ * @tparam T Float data-type. Accepts float and double
+ * @param a scaling factor
+ * @param v1 Device data pointer 1 (data to be modified)
+ * @param v2 Device data pointer 2 (the result data)
+ * @param data_size Length of device data.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ */
+
+template <class T = double, class DevTypeID = int>
+inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size,
+                             DevTypeID dev_id, cudaStream_t stream_id,
+                             const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<T, float>) {
+        const float alpha = a;
+        cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    } else if constexpr (std::is_same_v<T, double>) {
+        const double alpha = a;
+        cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    }
+}

/**
* @brief cuBLAS backed GPU data scaling.
*
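
The new helper is a thin wrapper over cuBLAS `cublasSaxpy`/`cublasDaxpy`; semantically it is the real axpy update `v2 <- a * v1 + v2` used above to accumulate per-slice probabilities in place. A NumPy sketch of the semantics (not the CUDA code):

```python
import numpy as np

def scale_and_add(a, v1, v2):
    """Model of scaleAndAdd_CUDA: v2 <- a * v1 + v2, in place."""
    v2 += a * v1
    return v2

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([10.0, 20.0, 30.0])
print(scale_and_add(0.5, v1, v2))  # [10.5 21.  31.5]
```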
23 changes: 23 additions & 0 deletions tests/lightning_tensor/test_measurements_class.py
@@ -126,3 +126,26 @@ def test_not_supported_shadowmp_shot_measurements(self):

        with pytest.raises(TypeError):
            m.measure_tensor_network(tape)
+
+    @pytest.mark.parametrize("n_qubits", range(4, 14, 2))
+    @pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 14, 2)))
+    def test_probs_many_wires(self, n_qubits, n_targets, tol):
+        """Test probs measuring many wires of a random quantum state."""
+        if n_targets >= n_qubits:
+            pytest.skip("Number of targets cannot exceed the number of wires.")
+
+        dev = qml.device(device_name, wires=n_qubits)
+        dq = qml.device("default.qubit", wires=n_qubits)
+
+        init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
+        init_state /= np.linalg.norm(init_state)
+
+        ops = [qml.StatePrep(init_state, wires=range(n_qubits))]
+
+        mp = qml.probs(wires=range(n_targets))
+
+        tape = qml.tape.QuantumScript(ops, [mp])
+        res = dev.execute(tape)
+        ref = dq.execute(tape)
+
+        assert np.allclose(res, ref, atol=tol, rtol=0)
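
Finally, a minimal end-to-end sketch of the user-facing behaviour this commit fixes (hypothetical snippet; assumes PennyLane with a `lightning.tensor` build that includes this fix):

```python
import pennylane as qml

dev = qml.device("lightning.tensor", wires=4)

@qml.qnode(dev)
def circuit():
    qml.Hadamard(wires=0)
    qml.CNOT(wires=[0, 1])
    return qml.probs(wires=[0, 1])  # analytic probs; wires 2-3 are projected out

print(circuit())  # expected ~[0.5, 0.0, 0.0, 0.5] for the Bell pair
```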
