Commit 49d57f5: wip debug
Hanting Zhang committed Feb 9, 2024
1 parent 1a0119a commit 49d57f5
Showing 1 changed file with 103 additions and 75 deletions: msm/pippenger.cuh
@@ -357,13 +357,26 @@ template <class bucket_t, class point_t, class affine_t, class scalar_t,
 class msm_t
 {
     const gpu_t &gpu;
+
+    // main data
+    bool owned;
+    affine_h *d_points;
+    scalar_t *d_scalars;
+    uint32_t *d_pidx;
     size_t npoints;
+    size_t nscalars;
+
+    // per setup constants
     uint32_t wbits, nwins;
+    uint32_t batch;
+    uint32_t stride;
+
+    // auxiliary space
+    char *d_total_blob;
     bucket_h *d_buckets;
-    affine_h *d_points;
-    scalar_t *d_scalars;
     vec2d_t<uint32_t> d_hist;
-    bool owned;
+    vec2d_t<uint2> d_temps;
+    vec2d_t<uint32_t> d_digits;
 
     template <typename T>
     using vec_t = slice_t<T>;
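Note: the regrouped fields make the allocation strategy explicit. Everything under "auxiliary space" is carved out of the single d_total_blob allocation (see setup_scratch below), so teardown frees one blob instead of tracking each buffer separately. A minimal sketch of that pattern, with hypothetical sizes rather than this file's exact layout:

    // Single-blob pattern (sketch; sizes are hypothetical placeholders)
    size_t bucket_sz = 2048 * sizeof(bucket_h);
    size_t hist_sz = 2048 * sizeof(uint32_t);
    char *blob = reinterpret_cast<char *>(gpu.Dmalloc(bucket_sz + hist_sz));
    bucket_h *buckets = reinterpret_cast<bucket_h *>(&blob[0]);      // first region
    uint32_t *hist = reinterpret_cast<uint32_t *>(&blob[bucket_sz]); // second region
    gpu.Dfree(blob);                                                 // one free releases both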
@@ -387,60 +400,49 @@ class msm_t
     }
 
 public:
-    msm_t(const affine_t points[], size_t np, bool owned,
-          size_t ffi_affine_sz = sizeof(affine_t), int device_id = -1)
-        : owned(owned), gpu(select_gpu(device_id)), d_points(nullptr), d_scalars(nullptr)
+    // Initialize the MSM by moving the points to the device
+    msm_t(const affine_t points[], size_t npoints, bool owned, int device_id = -1) : gpu(select_gpu(device_id))
     {
-        npoints = (np + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
+        // set default values for fields
+        this->d_points = nullptr;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = owned;
 
-        wbits = 17;
-        if (npoints > 192)
-        {
-            wbits = std::min(lg2(npoints + npoints / 2) - 8, 18);
-            if (wbits < 10)
-                wbits = 10;
-        }
-        else if (npoints > 0)
-        {
-            wbits = 10;
-        }
-        nwins = (scalar_t::bit_length() - 1) / wbits + 1;
+        this->d_total_blob = nullptr;
 
-        uint32_t row_sz = 1U << (wbits - 1);
-
-        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
-        size_t d_blob_sz = (d_buckets_sz * sizeof(d_buckets[0])) + (nwins * row_sz * sizeof(uint32_t));
+        d_points = reinterpret_cast<decltype(d_points)>(gpu.Dmalloc(npoints * sizeof(d_points[0])));
+        gpu.HtoD(d_points, points, npoints, sizeof(affine_h));
+        CUDA_OK(cudaGetLastError());
+    }
+
-        d_buckets = reinterpret_cast<decltype(d_buckets)>(gpu.Dmalloc(d_blob_sz));
-        d_hist = vec2d_t<uint32_t>(&d_buckets[d_buckets_sz], row_sz);
-        if (points)
-        {
-            d_points = reinterpret_cast<decltype(d_points)>(gpu.Dmalloc(points ? npoints * sizeof(d_points[0]) : 0));
-            gpu.HtoD(d_points, points, np, ffi_affine_sz);
-        }
+    msm_t(affine_h *d_points, size_t npoints, int device_id = -1) : gpu(select_gpu(device_id))
+    {
+        // set default values for fields
+        this->d_points = d_points;
+        this->d_scalars = nullptr;
+        this->d_pidx = nullptr;
+        this->npoints = npoints;
+        this->owned = false;
+
-        if (owned)
-            npoints = 0;
-        else
-            npoints = np;
+        this->d_total_blob = nullptr;
     }
-    inline msm_t(vec_t<affine_t> points, size_t ffi_affine_sz = sizeof(affine_t),
-                 int device_id = -1)
-        : msm_t(points, points.size(), ffi_affine_sz, device_id){};
-    inline msm_t(int device_id = -1)
-        : msm_t(nullptr, 0, 0, device_id){};
 
     ~msm_t()
     {
         gpu.sync();
-        if (d_buckets)
-            gpu.Dfree(d_buckets);
+        if (d_total_blob)
+            gpu.Dfree(d_total_blob);
         if (d_points && owned)
             gpu.Dfree(d_points);
     }
 
     affine_h *get_d_points()
     {
         return d_points;
     }
 
     void set_d_points(affine_h *d_points)
     {
         assert(!this->owned);
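Note: ownership is now fixed at construction instead of being patched up afterwards with set_d_points. A minimal usage sketch of the two paths, assuming the template arguments that the wrappers at the bottom of this file instantiate:

    // Host points: the constructor copies them to the device and owns the copy.
    msm_t<bucket_t, point_t, affine_t, scalar_t> owning{points, npoints, true};

    // Device-resident points (e.g. held by an msm_context_t): no copy is made,
    // owned is false, so the destructor leaves them alone.
    msm_t<bucket_t, point_t, affine_t, scalar_t> borrowing{d_points, npoints};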
@@ -499,19 +501,69 @@ private:
     }
 
 public:
-    RustError invoke(point_t &out, const affine_t *points_, size_t npoints,
-                     const scalar_t *scalars, bool mont = true,
-                     size_t ffi_affine_sz = sizeof(affine_t))
+    // Compute various constants (stride length, window size) based on the number of scalars.
+    // Also allocate scratch space.
+    void setup_scratch(size_t npoints)
     {
-        assert(this->npoints == 0 || npoints <= this->npoints);
+        this->npoints = npoints;
 
+        uint32_t lg_n = lg2(npoints + npoints / 2);
+
+        wbits = 17;
+        if (npoints > 192)
+        {
+            wbits = std::min(lg_n - 8, (uint32_t)18);
+            if (wbits < 10)
+                wbits = 10;
+        }
+        else if (npoints > 0)
+        {
+            wbits = 10;
+        }
+        nwins = (scalar_t::bit_length() - 1) / wbits + 1;
+
+        uint32_t row_sz = 1U << (wbits - 1);
+
+        size_t d_buckets_sz = (nwins * row_sz) + (gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
+        d_buckets_sz *= sizeof(d_buckets[0]);
+        size_t d_hist_sz = nwins * row_sz * sizeof(uint32_t);
+
-        uint32_t lg_npoints = lg2(npoints + npoints / 2);
-        size_t batch = 1 << (std::max(lg_npoints, wbits) - wbits);
+        this->batch = 1 << (std::max(lg_n, wbits) - wbits);
         batch >>= 6;
         batch = batch ? batch : 1;
-        uint32_t stride = (npoints + batch - 1) / batch;
+        this->stride = (npoints + batch - 1) / batch;
         stride = (stride + WARP_SZ - 1) & ((size_t)0 - WARP_SZ);
 
+        size_t temp_sz = stride * std::max(2 * sizeof(uint2), sizeof(scalar_t));
+        size_t digits_sz = nwins * stride * sizeof(uint32_t);
+        // size_t pidx_sz = pidx ? stride * sizeof(uint32_t) : 0;
+
+        size_t d_blob_sz = d_buckets_sz + d_hist_sz + temp_sz + digits_sz; // + pidx_sz;
+
+        d_total_blob = reinterpret_cast<char *>(gpu.Dmalloc(d_blob_sz));
+        size_t offset = 0;
+        d_buckets = reinterpret_cast<decltype(d_buckets)>(&d_total_blob[offset]);
+        offset += d_buckets_sz;
+        d_hist = vec2d_t<uint32_t>((uint32_t *)&d_total_blob[offset], row_sz);
+        offset += d_hist_sz;
+
+        d_temps = vec2d_t<uint2>((uint2 *)&d_total_blob[offset], stride);
+        d_scalars = (scalar_t *)&d_total_blob[offset];
+        offset += temp_sz;
+        d_digits = vec2d_t<uint32_t>((uint32_t *)&d_total_blob[offset], stride);
+        offset += digits_sz;
+        // if (pidx)
+        //     d_pidx = (uint32_t *)&d_total_blob[offset];
+    }
+
+    RustError invoke(point_t &out, const affine_t *points, size_t npoints,
+                     const scalar_t *scalars, bool mont = true,
+                     size_t ffi_affine_sz = sizeof(affine_t))
+    {
+        assert(this->npoints == 0 || npoints <= this->npoints);
+
+        setup_scratch(npoints);
+
         std::vector<result_t> res(nwins);
         std::vector<bucket_t> ones(gpu.sm_count() * BATCH_ADD_BLOCK_SIZE / WARP_SZ);
 
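Note: a worked example of the sizing arithmetic in setup_scratch, assuming a 255-bit scalar field and lg2() returning floor(log2(n)):

    // npoints = 1 << 20:
    //   lg_n   = lg2(npoints + npoints / 2) = lg2(1572864) = 20
    //   wbits  = min(20 - 8, 18) = 12          // window width, clamped to [10, 18]
    //   nwins  = (255 - 1) / 12 + 1 = 22       // windows needed to cover 255 bits
    //   row_sz = 1 << (12 - 1) = 2048          // buckets per window (signed digits)
    //   batch  = (1 << (20 - 12)) >> 6 = 4     // the MSM is split into 4 batches
    //   stride = (2^20 + 3) / 4 = 262144       // points per batch, already warp-aligned
    // The blob is then carved as | buckets | hist | temps/scalars | digits |; d_temps
    // and d_scalars deliberately start at the same offset, which is why temp_sz takes
    // the max of the two layouts.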
@@ -520,29 +572,6 @@

         try
         {
-            // |scalars| being nullptr means the scalars are pre-loaded to
-            // |d_scalars|, otherwise allocate stride.
-            size_t temp_sz = scalars ? sizeof(scalar_t) : 0;
-            temp_sz = stride * std::max(2 * sizeof(uint2), temp_sz);
-
-            // |points| being nullptr means the points are pre-loaded to
-            // |d_points|, otherwise allocate double-stride.
-            const char *points = reinterpret_cast<const char *>(points_);
-            size_t d_point_sz = points ? (batch > 1 ? 2 * stride : stride) : 0;
-            d_point_sz *= sizeof(affine_h);
-
-            size_t digits_sz = nwins * stride * sizeof(uint32_t);
-
-            dev_ptr_t<uint8_t> d_temp{temp_sz + digits_sz + d_point_sz, gpu[2]};
-
-            vec2d_t<uint2> d_temps{&d_temp[0], stride};
-            vec2d_t<uint32_t> d_digits{&d_temp[temp_sz], stride};
-
-            scalar_t *d_scalars = scalars ? (scalar_t *)&d_temp[0]
-                                          : this->d_scalars;
-            affine_h *d_points = points ? (affine_h *)&d_temp[temp_sz + digits_sz]
-                                        : this->d_points;
-
             size_t d_off = 0; // device offset
             size_t h_off = 0; // host offset
             size_t num = stride > npoints ? npoints : stride;
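Note: the block removed above is what setup_scratch now replaces: d_temps, d_digits, and d_scalars are pre-carved from d_total_blob instead of being sized per call. Because d_scalars and d_temps alias the same region, each batch has to respect an ordering constraint, sketched here with illustrative step names rather than this file's kernel names:

    // 1. stage the batch's scalars in d_scalars (HtoD copy)
    // 2. decompose them into window digits in d_digits (reads d_scalars)
    // 3. sort digits / accumulate buckets using d_temps, which reuses the staging
    //    region; the scalars must be fully consumed by step 2 before step 3 runs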
@@ -823,8 +852,8 @@ static RustError mult_pippenger(point_t *out, const affine_t points[], size_t np
 {
     try
     {
-        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{nullptr, npoints, true};
-        return msm.invoke(*out, slice_t<affine_t>{points, npoints},
+        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{points, npoints, true};
+        return msm.invoke(*out, nullptr, npoints,
                           scalars, mont, ffi_affine_sz);
     }
     catch (const cuda_error &e)
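Note: mult_pippenger now hands the host points to the owning constructor and passes nullptr to invoke, rather than streaming points through invoke itself. A minimal host-side call, assuming the same template instantiation the FFI layer uses and eliding error handling:

    point_t result;
    RustError err = mult_pippenger<bucket_t, point_t, affine_t, scalar_t>(
        &result, points, npoints, scalars, true, sizeof(affine_t));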
@@ -847,8 +876,7 @@ static RustError mult_pippenger_with(point_t *out, msm_context_t<affine_h> *msm_
 {
     try
     {
-        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{nullptr, npoints, false};
-        msm.set_d_points(msm_context->d_points);
+        msm_t<bucket_t, point_t, affine_t, scalar_t> msm{msm_context->d_points, npoints};
         return msm.invoke(*out, nullptr, npoints,
                           scalars, mont, ffi_affine_sz);
     }
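Note: the borrowing constructor makes mult_pippenger_with a thin wrapper over points that already live on the device. A sketch of the intended two-phase flow, assuming a hypothetical init helper (defined elsewhere in this file) that uploads the point table into msm_context->d_points once:

    msm_context_t<affine_h> ctx;
    // hypothetical: upload the fixed point table once
    mult_pippenger_init<bucket_t, point_t, affine_t, scalar_t>(points, npoints, &ctx);
    // every subsequent MSM reuses the same device points with fresh scalars
    mult_pippenger_with<bucket_t, point_t, affine_t, scalar_t>(
        &result, &ctx, npoints, scalars, true, sizeof(affine_t));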