Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix qml.probs for lightning.tensor #906

Merged
merged 13 commits into from
Sep 11, 2024
Merged
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@

### Bug fixes

* Bug fix for analytic `probs` in the `lightning.tensor` C++ layer.
[(#906)](https://github.com/PennyLaneAI/pennylane-lightning/pull/906)

### Contributors

This release contains contributions from (in alphabetical order):
Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.39.0-dev19"
__version__ = "0.39.0-dev20"
Original file line number Diff line number Diff line change
Expand Up @@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
*/
void get_state_tensor(ComplexT *host_data,
const int32_t numHyperSamples = 1) {
std::vector<std::size_t> wires(BaseType::getNumQubits());
std::iota(wires.begin(), wires.end(), 0);
std::vector<int32_t> projected_modes{};
std::vector<int64_t> projected_mode_values{};

const std::size_t length = std::size_t{1} << wires.size();
const std::size_t length = std::size_t{1} << BaseType::getNumQubits();

DataBuffer<CFP_t, int> d_output_tensor(length, getDevTag(), true);

get_state_tensor(d_output_tensor.getData(), d_output_tensor.getLength(),
wires, numHyperSamples);
get_accessor_(d_output_tensor.getData(), length, projected_modes,
projected_mode_values, numHyperSamples);

d_output_tensor.CopyGpuDataToHost(host_data, length);
}

/**
* @brief Get a slice of the full state tensor
* @brief Get a slice of the full state tensor.
*
* @param tensor_data Pointer to the device memory for state tensor data.
* @param tensor_data_size Size of the state tensor data.
* @param wires Wires to get the state tensor for.
* @param projected_modes Projected modes to get the state tensor for.
* @param projected_mode_values Values of the projected modes.
* @param numHyperSamples Number of hyper samples to use in the calculation
* and is set to 1 by default.
*/
void get_state_tensor(CFP_t *tensor_data,
const std::size_t tensor_data_size,
const std::vector<std::size_t> &wires,
const std::vector<int32_t> &projected_modes,
const std::vector<int64_t> &projected_mode_values,
const int32_t numHyperSamples = 1) const {
auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
wires, BaseType::getNumQubits());

std::vector<int32_t> projected_modes{};

for (int32_t idx = 0;
idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
auto it = std::find(stateModes.begin(), stateModes.end(), idx);
if (it == stateModes.end()) {
projected_modes.emplace_back(idx);
}
}

std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);

if (projected_modes.empty()) {
get_accessor_(tensor_data, tensor_data_size, projected_modes,
projectedModeValues, numHyperSamples);
} else {
DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);

const std::size_t projected_modes_size = std::size_t(1)
<< projected_modes.size();
for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
for (std::size_t j = 0; j < projected_modes.size(); j++) {
projectedModeValues[j] = (idx >> j) & 1;
}

get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
projectedModeValues, numHyperSamples);
// Copy the data to the output tensor
scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
tmp.getData(), tensor_data, tmp.getLength(),
getDevTag().getDeviceID(),
getDevTag().getStreamID(), getCublasCaller());
}
}
get_accessor_(tensor_data, tensor_data_size, projected_modes,
projected_mode_values, numHyperSamples);
}

private:
Expand All @@ -478,13 +445,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
* @param tensor_data Pointer to the device memory for state tensor data.
* @param tensor_data_size Size of the tensor data.
* @param projected_modes Projected modes to get the state tensor for.
* @param projectedModeValues Values of the projected modes.
* @param projected_mode_values Values of the projected modes.
* @param numHyperSamples Number of hyper samples to use in the calculation
* and is set to 1 by default.
*/
void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
const std::vector<int32_t> &projected_modes,
const std::vector<int64_t> &projectedModeValues,
const std::vector<int64_t> &projected_mode_values,
const int32_t numHyperSamples = 1) const {
cutensornetStateAccessor_t accessor;
PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
Expand Down Expand Up @@ -543,7 +510,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* const cutensornetHandle_t */ getTNCudaHandle(),
/* cutensornetStateAccessor_t */ accessor,
/* const int64_t * projectedModeValues */
projectedModeValues.data(),
projected_mode_values.data(),
/* cutensornetWorkspaceDescriptor_t */ workDesc,
/* void *amplitudesTensor*/
static_cast<void *>(tensor_data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,22 +107,71 @@ template <class TensorNetT> class MeasurementsTNCuda {
DataBuffer<CFP_t, int> d_output_tensor(
length, tensor_network_.getDevTag(), true);

DataBuffer<PrecisionT, int> d_output_probs(
length, tensor_network_.getDevTag(), true);

d_output_tensor.zeroInit();
d_output_probs.zeroInit();

tensor_network_.get_state_tensor(d_output_tensor.getData(),
d_output_tensor.getLength(), wires,
numHyperSamples);
auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
wires, tensor_network_.getNumQubits());

// `10` here means `1024` elements to be calculated
// LCOV_EXCL_START
if (wires.size() > 10) {
DataBuffer<PrecisionT, int> d_output_probs(
length, tensor_network_.getDevTag(), true);
std::vector<int32_t> projected_modes{};

for (int32_t idx = 0;
idx < static_cast<int32_t>(tensor_network_.getNumQubits());
idx++) {
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
auto it = std::find(stateModes.begin(), stateModes.end(), idx);
if (it == stateModes.end()) {
projected_modes.emplace_back(idx);
}
}

std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);

if (projected_modes.size() == 0) {
tensor_network_.get_state_tensor(d_output_tensor.getData(),
d_output_tensor.getLength(), {},
{}, numHyperSamples);
getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
length, static_cast<int>(thread_per_block),
tensor_network_.getDevTag().getStreamID());

} else {
PL_ABORT_IF(projected_modes.size() > 64,
"Number of projected modes is greater than 64.");
const std::size_t projected_modes_size = std::size_t(1)
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
<< projected_modes.size();

DataBuffer<PrecisionT, int> tmp_probs(
length, tensor_network_.getDevTag(), true);

for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
for (std::size_t j = 0; j < projected_modes.size(); j++) {
projectedModeValues[j] = (idx >> j) & 1;
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
}

tensor_network_.get_state_tensor(
d_output_tensor.getData(), length, projected_modes,
projectedModeValues, numHyperSamples);

getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
length, static_cast<int>(thread_per_block),
tensor_network_.getDevTag().getStreamID());

// Copy the data to the output tensor
scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
d_output_probs.getData(),
tmp_probs.getLength(),
tensor_network_.getDevTag().getDeviceID(),
tensor_network_.getDevTag().getStreamID(),
tensor_network_.getCublasCaller());
}
}

// `10` here means `1024` elements to be calculated
// LCOV_EXCL_START
if (wires.size() > 10) {
PrecisionT sum;

asum_CUDA_device<PrecisionT>(
Expand All @@ -144,16 +193,11 @@ template <class TensorNetT> class MeasurementsTNCuda {
// number of wires. The CPU calculation is faster than the GPU
// calculation for a small number of wires due to the overhead of
// the GPU kernel launch.
std::vector<ComplexT> h_state_vector(length);
d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
h_state_vector.size());
// TODO: OMP support
for (std::size_t i = 0; i < length; i++) {
h_res[i] = std::norm(h_state_vector[i]);
}
d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());

// TODO: OMP support
PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
PrecisionT sum =
std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});

PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
// TODO: OMP support
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,18 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
}

SECTION("Test excessive projected wires failure") {
// Defining the State Vector that will be measured.
std::size_t bondDim = GENERATE(2, 3, 4, 5);
std::size_t num_qubits = 100;
std::size_t maxBondDim = bondDim;

TensorNetT mps_state{num_qubits, maxBondDim};

auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException);
}
}

TEMPLATE_TEST_CASE("Samples", "[Measures]", float, double) {
Expand Down
28 changes: 28 additions & 0 deletions pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2,
}
}

/**
* @brief cuBLAS backed GPU SAXPY/DAXPY.
*
* @tparam T Float data-type. Accepts float and double
* @param a scaling factor
* @param v1 Device data pointer 1 (data to be modified)
* @param v2 Device data pointer 2 (the result data)
* @param data_size Length of device data.
* @param dev_id the device on which the function should be executed.
* @param stream_id the CUDA stream on which the operation should be executed.
* @param cublas the CublasCaller object that manages the cuBLAS handle.
*/

template <class T = double, class DevTypeID = int>
inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size,
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
DevTypeID dev_id, cudaStream_t stream_id,
const CublasCaller &cublas) {
if constexpr (std::is_same_v<T, float>) {
const float alpha = a;
cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
v2, 1);
} else if constexpr (std::is_same_v<T, double>) {
const double alpha = a;
cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
v2, 1);
}
}

/**
* @brief cuBLAS backed GPU data scaling.
*
Expand Down
23 changes: 23 additions & 0 deletions tests/lightning_tensor/test_measurements_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,26 @@ def test_not_supported_shadowmp_shot_measurements(self):

with pytest.raises(TypeError):
m.measure_tensor_network(tape)

@pytest.mark.parametrize("n_qubits", range(4, 14, 2))
@pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 14, 2)))
def test_probs_many_wires(self, n_qubits, n_targets, tol):
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
"""Test probs measuring many wires of a random quantum state."""
if n_targets >= n_qubits:
pytest.skip("Number of targets cannot exceed the number of wires.")

dev = qml.device(device_name, wires=n_qubits)
dq = qml.device("default.qubit", wires=n_qubits)

init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
init_state /= np.linalg.norm(init_state)

ops = [qml.StatePrep(init_state, wires=range(n_qubits))]

mp = qml.probs(wires=range(n_targets))

tape = qml.tape.QuantumScript(ops, [mp])
res = dev.execute(tape)
ref = dq.execute(tape)

assert np.allclose(res, ref, atol=tol, rtol=0)
Loading